CA4 (Machine Learning)
Training different classifiers on a dataset and test models.
University of Tehran
810100216
Imports¶
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from copy import deepcopy
from IPython.display import display, HTML
from sklearn import metrics
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from mlxtend.evaluate import bias_variance_decomp
from dataclasses import dataclass
from xgboost import XGBClassifier
TARGET_COLUMN = 'NumPurchases'
Exploring Dataset¶
Reading a CSV File into a Pandas DataFrame¶
df = pd.read_csv('marketing_campaign.csv')
pd.set_option("display.max_columns", None)
df.head(5)
| Unnamed: 0 | ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntCoffee | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumWebVisitsMonth | Complain | NumPurchases | UsedCampaignOffer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 5524 | 1957 | Graduation | Single | 58138.0 | 0 | 0 | 04-09-2012 | 58 | 635.0 | 88 | 546 | 172 | 88 | 88.0 | NaN | 0 | 25 | 1 |
| 1 | 1 | 2174 | 1954 | Graduation | Single | 46344.0 | 1 | 1 | 08-03-2014 | 38 | NaN | 1 | 6 | 2 | 1 | 6.0 | 5.0 | 0 | 6 | 0 |
| 2 | 2 | 4141 | 1965 | Graduation | Together | 71613.0 | 0 | 0 | 21-08-2013 | 26 | NaN | 49 | 127 | 111 | 21 | 42.0 | NaN | 0 | 21 | 0 |
| 3 | 3 | 6182 | 1984 | Graduation | Together | 26646.0 | 1 | 0 | 10-02-2014 | 26 | 11.0 | 4 | 20 | 10 | 3 | 5.0 | 6.0 | 0 | 8 | 0 |
| 4 | 4 | 5324 | 1981 | PhD | Married | 58293.0 | 1 | 0 | 19-01-2014 | 94 | 173.0 | 43 | 118 | 46 | 27 | 15.0 | 5.0 | 0 | 19 | 0 |
DataFrame Information¶
The info method shows us the general info about our data frame—for example, the data stored in a dataset and the corresponding data types.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2240 entries, 0 to 2239 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 2240 non-null int64 1 ID 2240 non-null int64 2 Year_Birth 2240 non-null int64 3 Education 2240 non-null object 4 Marital_Status 2240 non-null object 5 Income 2017 non-null float64 6 Kidhome 2240 non-null int64 7 Teenhome 2240 non-null int64 8 Dt_Customer 2240 non-null object 9 Recency 2240 non-null int64 10 MntCoffee 2035 non-null float64 11 MntFruits 2240 non-null int64 12 MntMeatProducts 2240 non-null int64 13 MntFishProducts 2240 non-null int64 14 MntSweetProducts 2240 non-null int64 15 MntGoldProds 2227 non-null float64 16 NumWebVisitsMonth 2040 non-null float64 17 Complain 2240 non-null int64 18 NumPurchases 2240 non-null int64 19 UsedCampaignOffer 2240 non-null int64 dtypes: float64(4), int64(13), object(3) memory usage: 350.1+ KB
df.describe()
| Unnamed: 0 | ID | Year_Birth | Income | Kidhome | Teenhome | Recency | MntCoffee | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumWebVisitsMonth | Complain | NumPurchases | UsedCampaignOffer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2240.000000 | 2240.000000 | 2240.000000 | 2017.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2035.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2227.000000 | 2040.000000 | 2240.000000 | 2240.000000 | 2240.000000 |
| mean | 1119.500000 | 5592.159821 | 1968.805804 | 52297.080317 | 0.437946 | 0.506250 | 49.109375 | 304.239312 | 26.302232 | 166.950000 | 37.525446 | 27.062946 | 43.847777 | 5.326961 | 0.009375 | 14.862054 | 0.271875 |
| std | 646.776623 | 3246.662198 | 11.984069 | 25543.108215 | 0.563666 | 0.544538 | 28.962453 | 337.515534 | 39.773434 | 225.715373 | 54.628979 | 41.280498 | 51.897098 | 2.439349 | 0.096391 | 7.677173 | 0.445025 |
| min | 0.000000 | 0.000000 | 1893.000000 | 2447.000000 | -5.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 559.750000 | 2828.250000 | 1959.000000 | 35340.000000 | 0.000000 | 0.000000 | 24.000000 | 23.000000 | 1.000000 | 16.000000 | 3.000000 | 1.000000 | 9.000000 | 3.000000 | 0.000000 | 8.000000 | 0.000000 |
| 50% | 1119.500000 | 5458.500000 | 1970.000000 | 51369.000000 | 0.000000 | 0.000000 | 49.000000 | 177.000000 | 8.000000 | 67.000000 | 12.000000 | 8.000000 | 24.000000 | 6.000000 | 0.000000 | 15.000000 | 0.000000 |
| 75% | 1679.250000 | 8427.750000 | 1977.000000 | 68316.000000 | 1.000000 | 1.000000 | 74.000000 | 505.000000 | 33.000000 | 232.000000 | 50.000000 | 33.000000 | 56.000000 | 7.000000 | 0.000000 | 21.000000 | 1.000000 |
| max | 2239.000000 | 11191.000000 | 1996.000000 | 666666.000000 | 2.000000 | 2.000000 | 99.000000 | 1493.000000 | 199.000000 | 1725.000000 | 259.000000 | 263.000000 | 362.000000 | 20.000000 | 1.000000 | 44.000000 | 1.000000 |
Count and Percentage of Missing Data for Each Feature¶
def missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Summarize missing data per column.

    Returns a frame indexed by column name with two columns:
    ``Missing`` (absolute NaN count) and ``Percentage`` (fraction of rows).
    """
    counts = df.isna().sum()
    fractions = counts / len(df)
    return pd.concat([counts, fractions], axis=1, keys=["Missing", "Percentage"])
missing_values(df)
| Missing | Percentage | |
|---|---|---|
| Unnamed: 0 | 0 | 0.000000 |
| ID | 0 | 0.000000 |
| Year_Birth | 0 | 0.000000 |
| Education | 0 | 0.000000 |
| Marital_Status | 0 | 0.000000 |
| Income | 223 | 0.099554 |
| Kidhome | 0 | 0.000000 |
| Teenhome | 0 | 0.000000 |
| Dt_Customer | 0 | 0.000000 |
| Recency | 0 | 0.000000 |
| MntCoffee | 205 | 0.091518 |
| MntFruits | 0 | 0.000000 |
| MntMeatProducts | 0 | 0.000000 |
| MntFishProducts | 0 | 0.000000 |
| MntSweetProducts | 0 | 0.000000 |
| MntGoldProds | 13 | 0.005804 |
| NumWebVisitsMonth | 200 | 0.089286 |
| Complain | 0 | 0.000000 |
| NumPurchases | 0 | 0.000000 |
| UsedCampaignOffer | 0 | 0.000000 |
# Second missing-data summary, built column-by-column. It duplicates the
# information produced by missing_values() above, but reports the percentage
# on a 0-100 scale instead of a 0-1 fraction.
missing_data = pd.DataFrame({
    'Feature': df.columns,
    'Missing Count': df.isnull().sum(),
    'Missing Percentage': df.isnull().mean() * 100
})
missing_data
| Feature | Missing Count | Missing Percentage | |
|---|---|---|---|
| Unnamed: 0 | Unnamed: 0 | 0 | 0.000000 |
| ID | ID | 0 | 0.000000 |
| Year_Birth | Year_Birth | 0 | 0.000000 |
| Education | Education | 0 | 0.000000 |
| Marital_Status | Marital_Status | 0 | 0.000000 |
| Income | Income | 223 | 9.955357 |
| Kidhome | Kidhome | 0 | 0.000000 |
| Teenhome | Teenhome | 0 | 0.000000 |
| Dt_Customer | Dt_Customer | 0 | 0.000000 |
| Recency | Recency | 0 | 0.000000 |
| MntCoffee | MntCoffee | 205 | 9.151786 |
| MntFruits | MntFruits | 0 | 0.000000 |
| MntMeatProducts | MntMeatProducts | 0 | 0.000000 |
| MntFishProducts | MntFishProducts | 0 | 0.000000 |
| MntSweetProducts | MntSweetProducts | 0 | 0.000000 |
| MntGoldProds | MntGoldProds | 13 | 0.580357 |
| NumWebVisitsMonth | NumWebVisitsMonth | 200 | 8.928571 |
| Complain | Complain | 0 | 0.000000 |
| NumPurchases | NumPurchases | 0 | 0.000000 |
| UsedCampaignOffer | UsedCampaignOffer | 0 | 0.000000 |
Visualizing Feature Relationships and Correlations¶
Correlation Matrix with a heatmap¶
$\rho_{XY} = \frac{\text{cov}(X, Y)}{\sigma_X \sigma_Y}$
- $\rho_{XY}$ is the correlation coefficient between variables $X$ and $Y$.
- $\text{cov}(X, Y)$ represents the covariance between $X$ and $Y$.
- $\sigma_X$ and $\sigma_Y$ are the standard deviations of $X$ and $Y$ respectively.
$\text{cov}(X, Y) = \frac{\sum_{i=1}^{n}(X_i - \bar{X})(Y_i - \bar{Y})}{n-1}$
- $\text{cov}(X, Y)$ is the covariance between variables $X$ and $Y$.
- $X_i$ and $Y_i$ are individual data points for variables $X$ and $Y$.
- $\bar{X}$ and $\bar{Y}$ are the mean values of $X$ and $Y$ respectively.
- $n$ is the number of data points.
$\sigma = \sqrt{\frac{\sum_{i=1}^{n}(X_i - \bar{X})^2}{n-1}}$
- $\sigma$ is the standard deviation.
- $X_i$ represents individual data points.
- $\bar{X}$ is the mean of the data.
- $n$ is the number of data points.
def plot_correlation_heatmap(df):
    """Display an annotated heatmap of pairwise correlations between the
    numeric columns of *df* (non-numeric columns are ignored)."""
    corr = df.select_dtypes(include=['number']).corr()
    plt.figure(figsize=(15, 15))
    sns.heatmap(corr, annot=True, fmt=".3f", cmap="Blues", linewidths=1, square=True)
    plt.title('Correlation Matrix Heatmap')
    plt.xticks(rotation=45, ha='right')
    plt.show()
plot_correlation_heatmap(df)
Features with stronger correlation to the target column (NumPurchases)¶
def select_features_by_correlation(df, target_column='NumPurchases', threshold_low=0.25, threshold_high=0.45):
    """Return the numeric features whose absolute correlation with
    *target_column* exceeds both thresholds, sorted descending by correlation.

    The two-step filter mirrors the notebook's exploration flow: first keep
    everything above ``threshold_low``, then narrow to ``threshold_high``.
    """
    correlations = df.select_dtypes(include=['number']).corr()[target_column].drop(target_column)
    above_low = correlations[correlations.abs() > threshold_low].sort_values(ascending=False)
    return above_low[above_low.abs() > threshold_high].sort_values(ascending=False)
selected_features = select_features_by_correlation(df, target_column='NumPurchases')
selected_features
MntCoffee 0.715164 Income 0.562603 MntMeatProducts 0.554229 MntGoldProds 0.493939 MntSweetProducts 0.472876 MntFishProducts 0.469454 MntFruits 0.455461 Name: NumPurchases, dtype: float64
The thresholds are set with default values as:
- low : 0.25
- high : 0.45
Number of Observations for Each Unique with a Stronger Correlation.¶
df[selected_features.index].nunique()
MntCoffee 747 Income 1810 MntMeatProducts 558 MntGoldProds 212 MntSweetProducts 177 MntFishProducts 182 MntFruits 158 dtype: int64
# Draw a grid of histograms, one per selected feature, three per row.
fig_width = 20
fig_height = 6
num_cols = 3
num_rows = (len(selected_features.index) + num_cols - 1) // num_cols  # ceil division
# Create a figure with subplots
fig, axes = plt.subplots(num_rows, num_cols, figsize=(fig_width, fig_height * num_rows))
axes = axes.flatten()
for idx, feature in enumerate(selected_features.index):
    ax = axes[idx]
    ax.hist(df[feature], edgecolor='white')
    ax.set_ylabel(feature)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel('Value')
# Remove any unused axes left over in the last row of the grid.
for idx in range(len(selected_features.index), len(axes)):
    fig.delaxes(axes[idx])
plt.tight_layout()
plt.show()
Visualizing Feature Correlation using scatter and hexbin.¶
Here, the scatter and hexbin diagrams are drawn for the chosen features in the previous part.
With a hexbin diagram, we can see the density of similar data.
# For each selected feature, draw a scatter plot and a hexbin density plot
# against the target column, side by side (two axes per feature row).
sns.set(rc={'figure.figsize': (15, 6)})
num_features = len(selected_features.index)
num_cols = 1
num_rows = (num_features + num_cols - 1) // num_cols
fig, axes = plt.subplots(num_rows, num_cols * 2, figsize=(15, 6 * num_rows))  # Adjust size to fit plots
axes = axes.flatten()
for idx, feature in enumerate(selected_features.index):
    # Scatter plot
    ax_scatter = axes[2 * idx]
    sns.scatterplot(x=feature, y=TARGET_COLUMN, data=df, ax=ax_scatter)
    ax_scatter.set_title(f'Scatter Plot')
    ax_scatter.set_xlabel(feature)
    ax_scatter.set_ylabel(TARGET_COLUMN)
    # Hexbin plot
    ax_hexbin = axes[2 * idx + 1]
    hb = ax_hexbin.hexbin(df[feature], df[TARGET_COLUMN], gridsize=50, cmap='Greens')
    cb = fig.colorbar(hb, ax=ax_hexbin)
    cb.set_label('Counts')
    ax_hexbin.set_title(f'Hexbin Plot')
    ax_hexbin.set_xlabel(feature)
    ax_hexbin.set_ylabel(TARGET_COLUMN)
# Remove any axes beyond the last plotted pair.
for idx in range(num_features * num_cols * 2, len(axes)):
    fig.delaxes(axes[idx])
plt.tight_layout()
plt.show()
Other insightful investigations¶
def box_plot(df, selected_features, target_col=TARGET_COLUMN):
    """Show one box plot per selected feature, with the target column on the
    x-axis and the feature's values on the y-axis."""
    for feature in selected_features.index:
        sns.boxplot(x=target_col, y=feature, data=df)
        plt.title(f'Box Plot: {feature} vs {target_col}')
        plt.show()
sns.set(rc={'figure.figsize': (14, 6)})
box_plot(df, selected_features)
Preprocessing Dataset¶
Some techniques to solve missing values problem¶
Imputation
All the missing values are replaced by a substitution. A substitution could be:- Mean: A simple solution, but outliers can affect it negatively.
- Median: This is often a more robust choice as it is not affected by outliers.
- Mode: Preferred for categorical data where mean and median are not defined.
- Random Fill: Selected values should be between the minimum and maximum in the column.
- Prediction: Missing values are predicted based on the properties of the rows.
Dropping
We can either drop some columns or rows.- Dropping Columns:
- The entire column is removed, which may lead to a loss of valuable data. This method is more justifiable if a large portion of the column is missing values, as there are fewer reliable values to impute missing data with good precision.
- Dropping Rows:
- This is better applied to rows with a majority of missing values to avoid introducing bias or inaccuracies into the dataset. However, this method can result in the loss of potentially important information, so it should be used cautiously.
- Dropping Columns:
Handling Missing Values in Dataframe¶
missing_values(df)
| Missing | Percentage | |
|---|---|---|
| Unnamed: 0 | 0 | 0.000000 |
| ID | 0 | 0.000000 |
| Year_Birth | 0 | 0.000000 |
| Education | 0 | 0.000000 |
| Marital_Status | 0 | 0.000000 |
| Income | 223 | 0.099554 |
| Kidhome | 0 | 0.000000 |
| Teenhome | 0 | 0.000000 |
| Dt_Customer | 0 | 0.000000 |
| Recency | 0 | 0.000000 |
| MntCoffee | 205 | 0.091518 |
| MntFruits | 0 | 0.000000 |
| MntMeatProducts | 0 | 0.000000 |
| MntFishProducts | 0 | 0.000000 |
| MntSweetProducts | 0 | 0.000000 |
| MntGoldProds | 13 | 0.005804 |
| NumWebVisitsMonth | 200 | 0.089286 |
| Complain | 0 | 0.000000 |
| NumPurchases | 0 | 0.000000 |
| UsedCampaignOffer | 0 | 0.000000 |
- Handling Invalid Values
- First, delete invalid values from columns, such as negative values in countable features like
Year_Birth,Income, orKidhome. This step ensures the integrity of the data, as negative values in these contexts are typically nonsensical or indicative of errors. After removing these values, the remaining data can be further processed using imputation or dropping techniques as needed.
- First, delete invalid values from columns, such as negative values in countable features like
# Columns whose values can never legitimately be negative.
pNegCols = ["UsedCampaignOffer", "NumPurchases", "Complain", "NumWebVisitsMonth", "MntGoldProds",
            "MntSweetProducts", "MntFishProducts", "MntMeatProducts", "MntFruits", "MntCoffee",
            "Recency", "Teenhome", "Kidhome", "Income", "Year_Birth", "ID"]
# Replace invalid (negative) entries with NaN so the imputation step below
# fills them along with the originally-missing values.
df[pNegCols] = np.where(df[pNegCols] < 0, np.nan, df[pNegCols])
- Filling in Missing Values
- After handling invalid values, use the
medianto fill in the rest of the missing values. This method is preferred because the median is less affected by outliers and can provide a more accurate representation of the central tendency of the data, especially in the presence of skewed distributions.
- After handling invalid values, use the
def fillna_with_median(df):
    """Fill NaNs in numeric columns with each column's median, in place.

    Returns the same DataFrame for convenience.
    """
    medians = df.median(numeric_only=True)
    df.fillna(medians, inplace=True)
    return df
def fill_with_mode(df):
    """Fill NaNs in numeric columns with each column's mode, in place.

    Fix: now returns the DataFrame, consistent with fillna_with_median,
    so the two imputation helpers are interchangeable in call chains.
    (The original returned None.)
    """
    # .mode() can yield several rows when values tie; take the first row,
    # which is the smallest modal value per column.
    mode_values = df.mode(numeric_only=True).iloc[0]
    df.fillna(mode_values, inplace=True)
    return df
fillna_with_median(df)
missing_values(df)
| Missing | Percentage | |
|---|---|---|
| Unnamed: 0 | 0 | 0.0 |
| ID | 0 | 0.0 |
| Year_Birth | 0 | 0.0 |
| Education | 0 | 0.0 |
| Marital_Status | 0 | 0.0 |
| Income | 0 | 0.0 |
| Kidhome | 0 | 0.0 |
| Teenhome | 0 | 0.0 |
| Dt_Customer | 0 | 0.0 |
| Recency | 0 | 0.0 |
| MntCoffee | 0 | 0.0 |
| MntFruits | 0 | 0.0 |
| MntMeatProducts | 0 | 0.0 |
| MntFishProducts | 0 | 0.0 |
| MntSweetProducts | 0 | 0.0 |
| MntGoldProds | 0 | 0.0 |
| NumWebVisitsMonth | 0 | 0.0 |
| Complain | 0 | 0.0 |
| NumPurchases | 0 | 0.0 |
| UsedCampaignOffer | 0 | 0.0 |
- Advanced Imputation Techniques
- For rows containing more than a certain number of missing values (e.g., more than two NaN values), consider deleting those rows if they represent a small proportion of the dataset. After removing these rows, apply techniques such as
KNNImputerto fill the remaining missing values. KNNImputer uses the nearest neighbors' values to impute missing data, making it a sophisticated method that leverages the relationships between features.
- For rows containing more than a certain number of missing values (e.g., more than two NaN values), consider deleting those rows if they represent a small proportion of the dataset. After removing these rows, apply techniques such as
Normalization and Standardization in Numerical Features¶
These techniques aim to bring the values of different features onto a similar scale.
- Normalization:
- It scales the values of a feature to a specific range, often between 0 and 1.
- The normalization formula for a numerical feature (X) is given by: $$ Xnorm = \frac {X-Xmin} {Xmax-Xmin} $$
where $X$ is the original value of the feature, $Xmin$ is the minimum value in the feature, and $Xmax$ is the maximum value in the feature.
Normalization is useful when the features have different ranges, and algorithms like neural networks or k-nearest neighbors may perform better when the input features are within a consistent scale.
Standardization:
- transforms the data to have a mean of 0 and a standard deviation of 1. It makes the distribution of each feature more interpretable and facilitates comparisons between different features. $$ Z = \frac {X-\bar{X}} {Xstd} $$ where $X$ is the original value of the feature, $\bar{X}$ is the mean of the feature, and $Xstd$ is the standard deviation.
- Standardization is particularly useful when the features have different units or when features follow different distributions. It helps algorithms that rely on the assumption of a normal distribution.
def standardization(df, exclude_cols: list = None):
    """Standardize all numeric columns to zero mean / unit variance, in place.

    Columns listed in *exclude_cols* keep their original values.

    Fix: the original used a mutable default argument (``exclude_cols=[]``),
    a classic Python pitfall; a None sentinel is used instead. Behavior for
    all existing callers is unchanged.
    """
    exclude_cols = [] if exclude_cols is None else exclude_cols
    numeric_cols = df.select_dtypes(include="number")
    df[numeric_cols.columns] = StandardScaler().fit_transform(numeric_cols)
    # Restore excluded columns from the pre-scaling snapshot.
    df[exclude_cols] = numeric_cols[exclude_cols]
    return df
def normalization(df, exclude_cols: list = None):
    """Min-max scale all numeric columns into [0, 1], in place.

    Columns listed in *exclude_cols* keep their original values.

    Fix: the original used a mutable default argument (``exclude_cols=[]``),
    a classic Python pitfall; a None sentinel is used instead. Behavior for
    all existing callers is unchanged.
    """
    exclude_cols = [] if exclude_cols is None else exclude_cols
    numeric_cols = df.select_dtypes(include="number")
    df[numeric_cols.columns] = MinMaxScaler().fit_transform(numeric_cols)
    # Restore excluded columns from the pre-scaling snapshot.
    df[exclude_cols] = numeric_cols[exclude_cols]
    return df
def plot_histogram(df):
    """Show 20-bin histograms of every numeric column on a single figure."""
    df.hist(bins=20, figsize=(20, 15))
    plt.show()
Normalizing or standardizing is beneficial as the numerical features in this dataset have different scales.
plot_histogram(df)
# Min-max scale the numeric features to [0, 1], keeping the raw target.
df = normalization(df, ['NumPurchases'])
plot_histogram(df)
# NOTE(review): standardization is applied on top of the already-normalized
# data, so the min-max scaling above is effectively superseded — confirm the
# double scaling is intentional rather than choosing one of the two.
df = standardization(df, ['NumPurchases'])
plot_histogram(df)
df.describe()
| Unnamed: 0 | ID | Year_Birth | Income | Kidhome | Teenhome | Recency | MntCoffee | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumWebVisitsMonth | Complain | NumPurchases | UsedCampaignOffer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2240.000000 | 2.240000e+03 |
| mean | -1.015061e-16 | -9.198991e-17 | 4.452787e-16 | -3.552714e-16 | 9.992007e-17 | -3.172066e-18 | 1.292617e-16 | -5.709718e-17 | -6.819941e-17 | -8.564578e-17 | -1.015061e-16 | 2.537653e-17 | 9.198991e-17 | 4.440892e-17 | -3.053113e-17 | 14.862054 | 3.806479e-17 |
| std | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 7.677173 | 1.000223e+00 |
| min | -1.731278e+00 | -1.722818e+00 | -6.326960e+00 | -2.053225e+00 | -8.237017e-01 | -9.298944e-01 | -1.696001e+00 | -9.038860e-01 | -6.614492e-01 | -7.398135e-01 | -6.870680e-01 | -6.557331e-01 | -8.449660e-01 | -2.306859e+00 | -9.728167e-02 | 0.000000 | -6.110569e-01 |
| 25% | -8.656389e-01 | -8.514982e-01 | -8.184192e-01 | -6.304878e-01 | -8.237017e-01 | -9.298944e-01 | -8.671566e-01 | -8.204773e-01 | -6.363012e-01 | -6.689119e-01 | -6.321399e-01 | -6.315032e-01 | -6.710752e-01 | -5.939679e-01 | -9.728167e-02 | 8.000000 | -6.110569e-01 |
| 50% | -9.616873e-17 | -4.117757e-02 | 9.967091e-02 | -3.448418e-02 | -8.237017e-01 | -9.298944e-01 | -3.777284e-03 | -3.570960e-01 | -4.602650e-01 | -4.429132e-01 | -4.673554e-01 | -4.618937e-01 | -3.812572e-01 | 2.624776e-01 | -9.728167e-02 | 15.000000 | -6.110569e-01 |
| 75% | 8.656389e-01 | 8.735813e-01 | 6.839101e-01 | 5.852568e-01 | 1.034397e+00 | 9.069340e-01 | 8.596020e-01 | 5.140609e-01 | 1.684356e-01 | 2.882592e-01 | 2.284015e-01 | 1.438543e-01 | 2.370211e-01 | 6.907003e-01 | -9.728167e-02 | 21.000000 | 1.636509e+00 |
| max | 1.731278e+00 | 1.724876e+00 | 2.269702e+00 | 2.535543e+01 | 2.892495e+00 | 2.743762e+00 | 1.722981e+00 | 3.708303e+00 | 4.343008e+00 | 6.904261e+00 | 4.055064e+00 | 5.716737e+00 | 6.149307e+00 | 6.257596e+00 | 1.027943e+01 | 44.000000 | 1.636509e+00 |
Encoding¶
We should encode the categorical features.
One-Hot Encoding:
- Creating a new feature for each category.
- This method is appropriate when the categories do not have an intrinsic order.
- It is particularly useful for algorithms that utilize the distance between data points, such as KNN.
- For example, if we have a feature with 3 categories, we can encode them as follows:
- Category 1: 1, 0, 0
- Category 2: 0, 1, 0
- Category 3: 0, 0, 1
Label Encoding:
- Assigning a numerical label to each category.
- This method is suitable when the categories have an order.
- It can be useful for tree-based models that can leverage the ordinal nature of the data.
- For example, if we have a feature with 3 categories, we can encode them as follows:
- Category 1: 0
- Category 2: 1
- Category 3: 2
Binary Encoding:
- Encoding the categories using binary numbers.
- This method is suitable when the categories do not have an order.
- It reduces the dimensionality compared to One-Hot Encoding.
- For example, if we have a feature with 3 categories, we can encode them as follows:
- Category 1: 00
- Category 2: 01
- Category 3: 10
Frequency Encoding:
- Encoding the categories using the frequency of each category.
- This method assigns a value to each category based on how often it appears in the dataset.
- It is useful when the relative frequency of categories carries meaningful information.
- For example, if we have a feature with 3 categories, we can encode them as follows:
- Category 1: 0.5
- Category 2: 0.25
- Category 3: 0.25
Target Encoding:
- Encoding the categories using the mean of the target variable for each category.
- This method is useful in predictive modeling, especially when there is a relationship between the category and the target variable.
- For example, if we have a feature with 3 categories, we can encode them as follows:
- Category 1: 0.5
- Category 2: 0.25
- Category 3: 0.75
def handle_non_numeric_columns_label_encoding(df):
    """Integer-encode every non-numeric column of *df* in place and return it.

    Each column is fit independently (fit_transform refits the encoder), so
    labels are assigned per column in sorted-category order.
    """
    non_numeric = df.select_dtypes(exclude=['number']).columns
    for col in non_numeric:
        df[col] = LabelEncoder().fit_transform(df[col])
    return df
df = handle_non_numeric_columns_label_encoding(df)
df.describe()
| Unnamed: 0 | ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntCoffee | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumWebVisitsMonth | Complain | NumPurchases | UsedCampaignOffer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2240.000000 | 2240.000000 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2240.000000 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2.240000e+03 | 2240.000000 | 2.240000e+03 |
| mean | -1.015061e-16 | -9.198991e-17 | 4.452787e-16 | 2.393750 | 3.729911 | -3.552714e-16 | 9.992007e-17 | -3.172066e-18 | 327.875446 | 1.292617e-16 | -5.709718e-17 | -6.819941e-17 | -8.564578e-17 | -1.015061e-16 | 2.537653e-17 | 9.198991e-17 | 4.440892e-17 | -3.053113e-17 | 14.862054 | 3.806479e-17 |
| std | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.124797 | 1.076277 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 190.165575 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 1.000223e+00 | 7.677173 | 1.000223e+00 |
| min | -1.731278e+00 | -1.722818e+00 | -6.326960e+00 | 0.000000 | 0.000000 | -2.053225e+00 | -8.237017e-01 | -9.298944e-01 | 0.000000 | -1.696001e+00 | -9.038860e-01 | -6.614492e-01 | -7.398135e-01 | -6.870680e-01 | -6.557331e-01 | -8.449660e-01 | -2.306859e+00 | -9.728167e-02 | 0.000000 | -6.110569e-01 |
| 25% | -8.656389e-01 | -8.514982e-01 | -8.184192e-01 | 2.000000 | 3.000000 | -6.304878e-01 | -8.237017e-01 | -9.298944e-01 | 163.750000 | -8.671566e-01 | -8.204773e-01 | -6.363012e-01 | -6.689119e-01 | -6.321399e-01 | -6.315032e-01 | -6.710752e-01 | -5.939679e-01 | -9.728167e-02 | 8.000000 | -6.110569e-01 |
| 50% | -9.616873e-17 | -4.117757e-02 | 9.967091e-02 | 2.000000 | 4.000000 | -3.448418e-02 | -8.237017e-01 | -9.298944e-01 | 326.000000 | -3.777284e-03 | -3.570960e-01 | -4.602650e-01 | -4.429132e-01 | -4.673554e-01 | -4.618937e-01 | -3.812572e-01 | 2.624776e-01 | -9.728167e-02 | 15.000000 | -6.110569e-01 |
| 75% | 8.656389e-01 | 8.735813e-01 | 6.839101e-01 | 3.000000 | 5.000000 | 5.852568e-01 | 1.034397e+00 | 9.069340e-01 | 485.000000 | 8.596020e-01 | 5.140609e-01 | 1.684356e-01 | 2.882592e-01 | 2.284015e-01 | 1.438543e-01 | 2.370211e-01 | 6.907003e-01 | -9.728167e-02 | 21.000000 | 1.636509e+00 |
| max | 1.731278e+00 | 1.724876e+00 | 2.269702e+00 | 4.000000 | 7.000000 | 2.535543e+01 | 2.892495e+00 | 2.743762e+00 | 662.000000 | 1.722981e+00 | 3.708303e+00 | 4.343008e+00 | 6.904261e+00 | 4.055064e+00 | 5.716737e+00 | 6.149307e+00 | 6.257596e+00 | 1.027943e+01 | 44.000000 | 1.636509e+00 |
Feasibility of Column Deletion¶
Columns with low correlation to the target variable can be removed because they likely provide little predictive value.
def remove_low_correlation_columns(df, threshold=0.2, target_column='NumPurchases'):
    """Drop, in place, every column whose absolute correlation with
    *target_column* is below *threshold*.

    Bug fix: the original ignored its ``threshold`` parameter — a hard-coded
    ``correlation_threshold = 0.2`` shadowed it, so callers could never change
    the cutoff. The target column is never dropped (its self-correlation is 1).

    Returns the same DataFrame for convenience.
    """
    correlations = df.corr()[target_column]
    low_corr = correlations[correlations.abs() < threshold].index
    df.drop(low_corr, axis=1, inplace=True)
    return df
remove_low_correlation_columns(df)
plot_correlation_heatmap(df)
Splitting the dataset into train and test sets¶
Some common percentages for splitting the dataset into train and test sets:¶
- train ratio: $90\%$ , test ratio: $10\%$
- train ratio: $80\%$ , test ratio: $20\%$
- train ratio: $70\%$ , test ratio: $30\%$
- train ratio: $60\%$ , test ratio: $40\%$
Some ways to split data into training and test sets:¶
Randomly split the dataset into train and test sets
- This method is the most common and involves shuffling the dataset randomly before dividing it into train and test sets.
- A potential issue with this approach is that the resulting train and test sets may not have the same distribution, mainly if the dataset is not large enough, affecting the model's generalizability.
Split the dataset based on time
- This method is beneficial for time series data, where the order of the data points matters.
- The training set consists of earlier periods, and the test set consists of later periods.
- This approach prevents future and past data leakage, which can occur if data is randomly split without considering time order.
Split the dataset based on the target
- This method is helpful in imbalanced datasets, where certain classes are underrepresented.
- Representing each class proportionally in both the train and test sets helps the model generalize better across all classes.
- Techniques such as stratified sampling can be used to maintain the distribution of the target variable in both subsets.
some of the most commonly used library methods for random dataset splitting¶
| Method | Library |
|---|---|
| train_test_split | scikit-learn |
| randn | numpy |
| sample | pandas |
def split_data(df: pd.DataFrame, target_column: str = TARGET_COLUMN, train_percent: float = 0.8, random_state: int = 1):
    """Split *df* into train/test feature frames and target series.

    Returns (x_train, x_test, y_train, y_test). The random_state makes the
    split reproducible.
    """
    features = df[df.columns.difference([target_column])]
    target = df[target_column]
    x_tr, x_te, y_tr, y_te = train_test_split(
        features, target, train_size=train_percent, random_state=random_state)
    return x_tr, x_te, y_tr, y_te
x_train, x_test, y_train, y_test = split_data(df)
The random_state is used as a seed for a random number generator and makes the dataset reproducible.
The ratio used is as below:
- train ratio: $80\%$ , test ratio: $20\%$
Validation Set¶
A validation set is utilized to assess the performance of a model during training and to prevent overfitting.
This set is crucial for tuning hyperparameters and making adjustments to improve the model’s performance, ensuring that the model does not just perform well on the training data but also generalizes effectively to new, unseen data.
K-Fold Cross Validation¶
It is a technique to partition a dataset into $k$ equally sized folds.
The model is trained and evaluated $k$ times, each using a different fold as the test set and the remaining $k-1$ folds as the training set.
This process helps assess the model's performance across various subsets of the data, ensuring that the model's performance is consistent across different subsets of the data.
1. Partitioning the Data:
- The dataset is divided into $k$ equally sized folds.
- Each fold serves as a test set while the remaining $k-1$ folds are used for training.
2. Training and Evaluation:
- The model is trained on the training set (comprising $k-1$ folds).
- The trained model is then evaluated on the test set (the remaining fold).
3. Iteration:
- Steps 1 and 2 are repeated $k$ times, with each of the folds being used exactly once as the test set.
4. Performance Metrics:
- The performance metrics (e.g., accuracy, precision, recall) from each iteration are
averagedto provide a more robust evaluation of the model.
This technique is also commonly used for hyperparameter tuning, aiding in the selection of the optimal set of hyperparameters that generalize well to various data samples.
Linear Regression¶
Main form of simple linear regression function: $$f(x) = \alpha x + \beta$$
here we want to find the slope ($\alpha$) and intercept ($\beta$) by setting the derivatives of the Residual Sum of Squares (RSS) function to zero:
- step 1: Compute RSS of the training data
$$ RSS = \Sigma (y_i - (\hat{\beta} + \hat{\alpha} * x_i) )^2 $$
- step 2: Compute the derivatives of the RSS function in terms of $\alpha$ and $\beta$, and set them equal to 0 to find the desired parameters
$$ \frac{\partial RSS}{\partial \beta} = \Sigma (-f(x_i) + \hat{\beta} + \hat{\alpha} * x_i) = 0$$ $$ \to \beta = \hat{y} - \hat{\alpha} \hat{x} \to (1)$$
$$ \frac{\partial RSS}{\partial \alpha} = \Sigma (-2 x_i y_i + 2 \hat{\beta} x_i + 2\hat{\alpha} x_i ^ 2) = 0 \to (2)$$
$$ (1) , (2) \to \hat{\alpha} = \frac{\Sigma{(x_i - \hat{x})(y_i - \hat{y})}}{\Sigma{(x_i - \hat{x})^2}} $$ $$ \hat{\beta} = \hat{y} - \hat{\alpha} \hat{x}$$
Using the formula provided earlier, the following function is implemented to calculate the parameters of a simple linear regression model.
$\beta$ = $\frac{\sum_{i=1}^{n} (x_iy_i) - \frac{1}{n}\sum_{i=1}^{n}x_i\sum_{i=1}^{n}y_i}{\sum_{i=1}^{n}x_i^2 - \frac{1}{n}(\sum_{i=1}^{n}x_i)^2}$
$\alpha$ = $\frac{1}{n}\sum_{i=1}^{n}y_i - \beta \frac{1}{n}\sum_{i=1}^{n}x_i$
selected_features = select_features_by_correlation(df, TARGET_COLUMN, 0.25, 0.477)
display(selected_features)
MntCoffee 0.678082 MntMeatProducts 0.554229 Income 0.535685 MntGoldProds 0.490752 Name: NumPurchases, dtype: float64
Features with a higher correlation to the target column are recommended for use in Linear Regression. This is because features with stronger correlations are typically more predictive, providing a more reliable basis for the model.
selected_features.describe()
count 4.000000 mean 0.564687 std 0.080157 min 0.490752 25% 0.524452 50% 0.544957 75% 0.585192 max 0.678082 Name: NumPurchases, dtype: float64
def simple_linear_regression(input_feature, output):
    """Closed-form least-squares fit of the line y = intercept + slope * x.

    :param input_feature: 1-D array-like of x values
    :param output: 1-D array-like of observed y values
    :return: (intercept, slope) tuple
    """
    n = len(input_feature)
    sum_x = np.sum(input_feature)
    sum_y = np.sum(output)
    # Slope from the normal equations: Cov-like numerator over Var-like denominator.
    numerator = np.sum(input_feature * output) - sum_x * sum_y / n
    denominator = np.sum(input_feature * input_feature) - sum_x * sum_x / n
    slope = numerator / denominator
    # Intercept so the line passes through the mean point (x̄, ȳ).
    intercept = sum_y / n - slope * sum_x / n
    return (intercept, slope)
The function bellow predicts values for the given data using the calculated intercept and slope. The prediction is based on the formula:
$\hat{y}$ = $\beta_0$ + $\beta_1 x$
def get_regression_predictions(input_feature, bias, slope):
    """Evaluate the fitted line y_hat = bias + slope * x for each x."""
    return bias + slope * input_feature
For model evaluation, Root Mean Square Error (RMSE) is used.
RMSE is the square root of the mean of the squared differences between the residuals, and the residuals are just a fancy word for the difference between the predicted output and the true output.
$RMSE$ = $\sqrt{\frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}$
def get_root_mean_square_error(predicted_values, outputs):
    """Root Mean Square Error between paired predictions and true values."""
    total = 0.0
    count = 0
    for y_hat, y in zip(predicted_values, outputs):
        total += (y_hat - y) ** 2
        count += 1
    return (total / count) ** 0.5
The RMSE has no bounds. Thus, it becomes challenging to determine whether a particular RMSE value is considered good or bad without any reference point.
Instead, we use the R2 score. The R2 score is calculated by comparing the sum of the squared differences between the actual and predicted values of the dependent variable to the total sum of squared differences between the actual and mean values of the dependent variable.
The R2 score is formulated as below:
$$R^2 = 1 - \frac{SSres}{SStot} = 1 - \frac{\sum_{i=1}^{n} (y_{i,true} - y_{i,pred})^2}{\sum_{i=1}^{n} (y_{i,true} - \bar{y}_{true})^2} $$
def get_r2_score(predicted_values, outputs):
    """Coefficient of determination: 1 - SS_res / SS_tot.

    Equals 1 for a perfect fit and 0 for a model no better than the mean.
    """
    ss_res = np.sum((outputs - predicted_values) ** 2)
    ss_tot = np.sum((outputs - np.mean(outputs)) ** 2)
    return 1 - ss_res / ss_tot
Now calculate the fitness of the model.
$\beta$ = $\frac{\sum_{i=1}^{n} (x_iy_i) - \frac{1}{n}\sum_{i=1}^{n}x_i\sum_{i=1}^{n}y_i}{\sum_{i=1}^{n}x_i^2 - \frac{1}{n}(\sum_{i=1}^{n}x_i)^2}$
$\alpha = \frac{1}{n}\sum_{i=1}^{n}y_i - \beta \frac{1}{n}\sum_{i=1}^{n}x_i$
$\hat{y}$ = $\alpha + \beta x$
$RMSE$ = $\sqrt{\frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}$
$R2$ = $1 - \frac{\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}{\sum_{i=1}^{n}(y_i - \bar{y})^2}$
def split_data(df: pd.DataFrame, target_column: str = TARGET_COLUMN, train_percent: float = 0.8, random_state: int = 1):
    """Split `df` into train/test features and target series.

    NOTE: `columns.difference` returns the feature columns in sorted order.

    :return: (x_train, x_test, y_train, y_test)
    """
    feature_frame = df[df.columns.difference([target_column])]
    target_series = df[target_column]
    train_x, test_x, train_y, test_y = train_test_split(
        feature_frame, target_series, train_size=train_percent, random_state=random_state)
    return train_x, test_x, train_y, test_y
# 80/20 split (random_state=1) used by all regression experiments below.
x_train, x_test, y_train, y_test = split_data(df)
def log_parameters(feature, RMSE, R2_score, intercept, slope):
    """Print a short evaluation summary for one single-feature model."""
    print(f"Feature :{feature}")
    print(f"RMSE :{RMSE:0.3f}")
    print(f"R2 Score :{R2_score:0.3f}")
    # BUG FIX: intercept and slope were swapped in the printed equation —
    # it showed `y = intercept * x + slope` (see the earlier outputs where
    # the ~14.9 intercept appears as the coefficient of x).
    print(f"y = {slope:0.3f} * x + {intercept:0.3f}")
    print("--------------------------------------------")
def plot_regression_line(x_test, y_test, feature, predicted_values):
    """Scatter actual test targets against `feature` and overlay the
    regression line implied by `predicted_values`."""
    plt.figure(figsize=(6, 4))
    plt.scatter(x_test[feature], y_test, label="Actual Data")
    plt.plot(x_test[feature], predicted_values, color='red', label="Regression Line")
    plt.title(f"Regression Line for {feature}")
    plt.xlabel(feature)
    plt.ylabel(TARGET_COLUMN)  # notebook-level target name
    plt.legend()
    plt.show()
# Fit, plot and score one simple-linear-regression model per selected feature.
for feature in selected_features.index:
    intercept, slope = simple_linear_regression(x_train[feature], y_train)
    predicted_values = get_regression_predictions(x_test[feature], intercept, slope)
    plot_regression_line(x_test, y_test, feature, predicted_values)
    RMSE = get_root_mean_square_error(predicted_values, y_test)
    R2_score = get_r2_score(predicted_values, y_test)
    log_parameters(feature, RMSE, R2_score, intercept, slope)
Feature :MntCoffee RMSE :5.502 R2 Score :0.439 y = 14.864 * x + 5.264 --------------------------------------------
Feature :MntMeatProducts RMSE :6.097 R2 Score :0.311 y = 14.889 * x + 4.238 --------------------------------------------
Feature :Income RMSE :5.720 R2 Score :0.393 y = 14.864 * x + 3.812 --------------------------------------------
Feature :MntGoldProds RMSE :6.257 R2 Score :0.274 y = 14.943 * x + 3.759 --------------------------------------------
Multiple Regression¶
Multiple regression is a statistical technique that aims to model the relationship between a dependent variable and two or more independent variables.
Multiple regression with n independent variables is expressed as follows:
$$f(x) = \beta _{0} + \beta_{1} x_{1} + \beta_{2} x_{2} + \beta_{3} x_{3} + \beta_{4} x_{4} + ... + \beta_{n} x_{n} + c $$
To optimize the model for accurate predictions, multiple regression commonly employs iterative algorithms such as gradient descent.
The main goal of the optimization process is to make our predictions as close as possible to the actual values.
We measure the prediction error using a cost function, usually denoted as $J(\beta)$.
$$ J(\beta)= \frac {1}{2m} Σ_{i=0}^{m-1}(y_i - (\hat \beta _{0} + \hat \beta_{1} x_{1} + \hat \beta_{2} x_{2} + \hat \beta_{3} x_{3} + \hat \beta_{4} x_{4} + ... + \hat \beta_{n} x_{n}) )^2 $$
Gradient descent iteratively adjusts the coefficients $(\beta_i)$ to minimize the cost function. The update rule for each coefficient is:
$$\beta_{i} = \beta _ {i} - \alpha \frac {∂J(\beta)}{∂\beta_{i}}$$
$$ \frac {∂J(\beta)}{∂\beta_{i}} = \frac {1}{m}Σ_{j=0}^{m-1}(y_j - (\hat \beta _{0} + \hat \beta_{1} x_{j1} + \hat \beta_{2} x_{j2} + \hat \beta_{3} x_{j3} + \hat \beta_{4} x_{j4} + ... + \hat \beta_{n} x_{jn})) x_{ji} $$
def predict_output(feature_matrix, weights, bias):
    """Linear-model prediction: X @ w + bias, one prediction per row of X."""
    return np.dot(feature_matrix, weights) + bias
Derivative computation
As we saw, the cost function is the sum over the data points of the squared difference between an observed output and a predicted output.
Since the derivative of a sum is the sum of the derivatives, we can compute the derivative for a single data point and then sum over data points. We can write the squared difference between the observed output and predicted output for a single point as follows:
$$ (output - (const* w _{0} + [feature_1] * w_{1} + ...+ [feature_n] * w_{n} ))^2 $$
With n features and a constant, the derivative will be:
$$ 2 * (output - (const* w _{0} + [feature_1] * w_{1} + ...+ [feature_n] * w_{n} )) $$
The term inside the parentheses is just the error (difference between prediction and output). So we can re-write this as:
$$2 * error*[feature_i] $$
That is, the derivative for the weight for feature i is the sum (over data points) of 2 times the product of the error and the feature itself. In the case of the constant then this is just twice the sum of the errors!
Recall that twice the sum of the product of two vectors is just twice the dot product of the two vectors. Therefore the derivative for the weight for feature_i is just two times the dot product between the values of feature_i and the current errors.
def feature_derivative(errors, feature):
    """Partial derivative of RSS w.r.t. one feature's weight: 2 * (errors · feature)."""
    return 2 * np.dot(errors, feature)
Gradient Descent
Given a starting point we update the current weights by moving in the negative gradient direction. Recall that the gradient is the direction of increase and therefore the negative gradient is the direction of decrease and we're trying to minimize a cost function.
The amount by which we move in the negative gradient direction is called the step size. We stop when we are sufficiently close to the optimum. We define this by requiring that the magnitude (length) of the gradient vector to be smaller than a fixed tolerance.
def regression_gradient_descent(feature_matrix, outputs, initial_weights, bias, step_size, tolerance):
    """Batch gradient descent for multiple linear regression.

    Iterates until the L2 norm of the weight gradient drops below `tolerance`.
    The bias is updated alongside the weights with its own gradient
    (-2 * sum of errors), but the stopping test only inspects the weight
    gradient (matching the original behaviour).

    :param feature_matrix: (m, n) array of training features
    :param outputs: (m,) array of observed targets
    :param initial_weights: (n,) starting weight vector
    :param bias: starting intercept value
    :param step_size: learning rate
    :param tolerance: stopping threshold on ||weight gradient||
    :return: (weights, bias) at convergence
    """
    weights = np.array(initial_weights)
    converged = False
    while not converged:
        predictions = predict_output(feature_matrix, weights, bias)
        errors = outputs - predictions
        # FIX: call feature_derivative with its documented argument order
        # (errors, feature).  The original passed (feature_matrix.T, errors),
        # which yields the same 2 * X^T e vector for 1-D errors but
        # contradicts the helper's contract and confuses readers.
        gradient = - feature_derivative(errors, feature_matrix)
        weights -= step_size * gradient
        bias_gradient = -2 * np.sum(errors)
        bias -= step_size * bias_gradient
        if np.linalg.norm(gradient) < tolerance:
            converged = True
    return weights, bias
def normalize_features(chosen_features, data_frame):
    """Z-score each chosen column in place (sample std, ddof=1) and return the frame."""
    for column in chosen_features:
        centre = data_frame[column].mean()
        spread = data_frame[column].std()
        data_frame.loc[:, column] = (data_frame[column] - centre) / spread
    return data_frame
def n_feature_regression(chosen_feature_matrix, target_matrix, keywords):
    """Run gradient-descent regression with the settings bundled in `keywords`.

    :param keywords: dict with 'initial_weights', 'step_size', 'tolerance', 'bias'
    :return: (weights, bias) tuple from the descent
    """
    return regression_gradient_descent(
        chosen_feature_matrix,
        target_matrix,
        keywords['initial_weights'],
        keywords['bias'],
        keywords['step_size'],
        keywords['tolerance'],
    )
def get_weights_and_bias(chosen_features):
    """
    Computes the weights and bias for a general n feature model.
    :param chosen_features: list of features to perform multiple regression on
    :return: chosen_feature_matrix, computed weights and bias via regression
    """
    # Fixed descent settings shared by all the experiments below.
    settings = {
        'initial_weights': np.array([0.5] * len(chosen_features)),
        'step_size': 1.e-4,
        'tolerance': 1.e-10,
        'bias': 0,
    }
    # Uses the notebook-level x_train / y_train split.
    feature_matrix = x_train[chosen_features].to_numpy()
    target_matrix = y_train.to_numpy()
    weights, bias = n_feature_regression(feature_matrix, target_matrix, settings)
    return feature_matrix, weights, bias
Two Feature Regression
In this part, we choose two features and implement multiple regression.
# Two-feature multiple regression: train on the two most correlated features,
# then score on the held-out split.
chosen_features = selected_features.index[:2]
chosen_feature_matrix, train_weights, bias = get_weights_and_bias(chosen_features)
predictions = predict_output(x_test[chosen_features], train_weights, bias)
R2_score = get_r2_score(predictions, y_test)
RMSE = get_root_mean_square_error(predictions, y_test)
print("RMSE :", RMSE)
print("R2 Score :", R2_score)
print("--------------------------------------------")
RMSE : 5.374576615233149 R2 Score : 0.4643213845425165 --------------------------------------------
Three Feature Regression
# Same experiment with the top three correlated features.
chosen_features = selected_features.index[:3]
chosen_feature_matrix, train_weights, bias = get_weights_and_bias(chosen_features)
predictions = predict_output(x_test[chosen_features], train_weights, bias)
R2_score = get_r2_score(predictions, y_test)
RMSE = get_root_mean_square_error(predictions, y_test)
print("RMSE :", RMSE)
print("R2 Score :", R2_score)
print("--------------------------------------------")
RMSE : 5.199870341008263 R2 Score : 0.49858095852511164 --------------------------------------------
Five Feature Regression
# NOTE(review): only 4 features were selected above, so [:5] actually yields
# all 4 of them — confirm whether a fifth feature was intended.
chosen_features = selected_features.index[:5]
chosen_feature_matrix, train_weights, bias = get_weights_and_bias(chosen_features)
predictions = predict_output(x_test[chosen_features], train_weights, bias)
R2_score = get_r2_score(predictions, y_test)
RMSE = get_root_mean_square_error(predictions, y_test)
print("RMSE :", RMSE)
print("R2 Score :", R2_score)
print("--------------------------------------------")
RMSE : 5.0354824739760895 R2 Score : 0.529783385182038 --------------------------------------------
Classification¶
Here, I created a new column named PurchaseRate based on NumPurchases.
# Binarise the regression target: HIGH when NumPurchases exceeds its median,
# LOW otherwise; then drop the raw count and retarget on the new label.
df['PurchaseRate'] = np.where(df['NumPurchases'] > df['NumPurchases'].median(), 'HIGH', 'LOW')
median_num_purchases = df['NumPurchases'].median()  # saved before the drop; not used again below
df.drop(columns=['NumPurchases'], inplace=True)
TARGET_COLUMN = 'PurchaseRate'
df
| Income | Kidhome | MntCoffee | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumWebVisitsMonth | UsedCampaignOffer | PurchaseRate | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.244835 | -0.823702 | 1.057762 | 1.551577 | 1.679702 | 2.462147 | 1.476500 | 0.855299 | 0.262478 | 1.636509 | HIGH |
| 1 | -0.241838 | 1.034397 | -0.357096 | -0.636301 | -0.713225 | -0.650449 | -0.631503 | -0.729039 | -0.165745 | -0.611057 | LOW |
| 2 | 0.800874 | -0.823702 | -0.357096 | 0.570804 | -0.177032 | 1.345274 | -0.146905 | -0.033476 | 0.262478 | -0.611057 | HIGH |
| 3 | -1.054666 | 1.034397 | -0.869905 | -0.560857 | -0.651187 | -0.503974 | -0.583043 | -0.748360 | 0.262478 | -0.611057 | LOW |
| 4 | 0.251231 | 1.034397 | -0.369453 | 0.419916 | -0.216914 | 0.155164 | -0.001525 | -0.555148 | -0.165745 | -0.611057 | HIGH |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2235 | 0.372136 | -0.823702 | 1.286363 | 0.419916 | 0.066692 | 0.081926 | 2.203398 | 3.927370 | -0.165745 | -0.611057 | HIGH |
| 2236 | 0.487305 | 2.892495 | 0.350333 | -0.661449 | -0.606873 | -0.687068 | -0.655733 | -0.690396 | 0.690700 | 1.636509 | HIGH |
| 2237 | 0.197092 | -0.823702 | 1.901116 | 0.545656 | 0.221789 | -0.101168 | -0.364974 | -0.381257 | 0.262478 | 1.636509 | HIGH |
| 2238 | 0.703160 | -0.823702 | 0.418295 | 0.092992 | 0.208495 | 0.777683 | 0.071165 | 0.333627 | 0.262478 | -0.611057 | HIGH |
| 2239 | 0.027413 | 1.034397 | -0.644392 | -0.586005 | -0.469501 | -0.650449 | -0.631503 | -0.439221 | 0.690700 | 1.636509 | LOW |
2240 rows × 11 columns
# Encode the label as 0/1.  FIX: the original used chained-assignment
# `df['PurchaseRate'].replace(..., inplace=True)`, which triggers pandas
# FutureWarnings (chained inplace will stop working in pandas 3.0); assigning
# the replaced column back is the supported form and avoids silent downcasting.
df['PurchaseRate'] = df['PurchaseRate'].replace({'LOW': 0, 'HIGH': 1})
C:\Users\ASUS\AppData\Local\Temp\ipykernel_22524\1709526377.py:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
df['PurchaseRate'].replace({'LOW': 0, 'HIGH': 1}, inplace=True)
C:\Users\ASUS\AppData\Local\Temp\ipykernel_22524\1709526377.py:1: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
df['PurchaseRate'].replace({'LOW': 0, 'HIGH': 1}, inplace=True)
df
| Income | Kidhome | MntCoffee | MntFruits | MntMeatProducts | MntFishProducts | MntSweetProducts | MntGoldProds | NumWebVisitsMonth | UsedCampaignOffer | PurchaseRate | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.244835 | -0.823702 | 1.057762 | 1.551577 | 1.679702 | 2.462147 | 1.476500 | 0.855299 | 0.262478 | 1.636509 | 1 |
| 1 | -0.241838 | 1.034397 | -0.357096 | -0.636301 | -0.713225 | -0.650449 | -0.631503 | -0.729039 | -0.165745 | -0.611057 | 0 |
| 2 | 0.800874 | -0.823702 | -0.357096 | 0.570804 | -0.177032 | 1.345274 | -0.146905 | -0.033476 | 0.262478 | -0.611057 | 1 |
| 3 | -1.054666 | 1.034397 | -0.869905 | -0.560857 | -0.651187 | -0.503974 | -0.583043 | -0.748360 | 0.262478 | -0.611057 | 0 |
| 4 | 0.251231 | 1.034397 | -0.369453 | 0.419916 | -0.216914 | 0.155164 | -0.001525 | -0.555148 | -0.165745 | -0.611057 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2235 | 0.372136 | -0.823702 | 1.286363 | 0.419916 | 0.066692 | 0.081926 | 2.203398 | 3.927370 | -0.165745 | -0.611057 | 1 |
| 2236 | 0.487305 | 2.892495 | 0.350333 | -0.661449 | -0.606873 | -0.687068 | -0.655733 | -0.690396 | 0.690700 | 1.636509 | 1 |
| 2237 | 0.197092 | -0.823702 | 1.901116 | 0.545656 | 0.221789 | -0.101168 | -0.364974 | -0.381257 | 0.262478 | 1.636509 | 1 |
| 2238 | 0.703160 | -0.823702 | 0.418295 | 0.092992 | 0.208495 | 0.777683 | 0.071165 | 0.333627 | 0.262478 | -0.611057 | 1 |
| 2239 | 0.027413 | 1.034397 | -0.644392 | -0.586005 | -0.469501 | -0.650449 | -0.631503 | -0.439221 | 0.690700 | 1.636509 | 0 |
2240 rows × 11 columns
splitting data to test and train¶
def split_data(dataframe: pd.DataFrame, outcome: str, train_percent: float = 0.7):
    """Split `dataframe` into train/test features and labels.

    :param dataframe: full dataset including the label column
    :param outcome: name of the label column
    :param train_percent: fraction of rows assigned to the training split
    :return: (x_train, x_test, y_train, y_test)
    """
    features = dataframe.drop(columns=[outcome])
    labels = dataframe[outcome]
    # FIX: the original bound the result to a local also named `split_data`,
    # shadowing this function's own name; unpack directly instead.
    x_tr, x_te, y_tr, y_te = train_test_split(features, labels, train_size=train_percent, random_state=1)
    return x_tr, x_te, y_tr, y_te
# 70/30 split on the binary classification label.
x_train, x_test, y_train, y_test = split_data(df, 'PurchaseRate')
class Classifier:
    """Thin wrapper that fits a sklearn-style estimator and evaluates it.

    `model` is the estimator CLASS (not an instance): it is instantiated with
    `params`, fitted on the training split, and immediately used to compute
    cached predictions for the test split.
    """

    def __init__(self, model, x_train, y_train, x_test, y_test, params=None):
        self.model = model
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.params = params if params else {}
        self.set_params()

    def set_params(self):
        """(Re)build the estimator from `self.params`, fit it, cache test predictions."""
        self.model_instance = self.model(**self.params)
        self.model_instance.fit(self.x_train, self.y_train)
        self.predictions = self.model_instance.predict(self.x_test)

    def accuracy_test(self) -> float:
        """Accuracy on the held-out test split."""
        return metrics.accuracy_score(self.y_test, self.predictions)

    def accuracy_train(self) -> float:
        """Accuracy on the training split (useful for over/under-fitting checks)."""
        train_predict = self.model_instance.predict(self.x_train)
        return metrics.accuracy_score(self.y_train, train_predict)

    def confusion_matrix(self):
        """Plot the confusion matrix of the cached test predictions."""
        matrix = metrics.confusion_matrix(self.y_test, self.predictions)
        matrix_disp = metrics.ConfusionMatrixDisplay(matrix)
        matrix_disp.plot(cmap='Blues')
        plt.grid(False)
        plt.title(f'{self.model.__name__} Confusion Matrix')
        plt.show()

    def log_grid_result(self, grid):
        """Print the best hyper-parameters and CV score of a fitted grid search."""
        print(f"- Best hyperparameters : {grid.best_params_}")
        print(f"- Best model's train score (accuracy): {grid.best_score_:0.3f}")

    def grid_search(self, search_params, scoring='accuracy') -> 'tuple[float, GridSearchCV]':
        """Exhaustive hyper-parameter search over `search_params`.

        :return: (test score of the best estimator, the fitted GridSearchCV)
        """
        grid = GridSearchCV(self.model_instance, search_params, scoring=scoring)
        grid.fit(self.x_train, self.y_train)
        test_score = grid.score(self.x_test, self.y_test)
        print(f"- model's test accuracy : {self.accuracy_test():0.3f}")
        print(f"- Test Score(accuracy) : {test_score:0.3f}")
        self.log_grid_result(grid)
        print(f"- Best model's test score : {test_score:0.3f}")
        # FIX: the annotated return value was commented out, so callers
        # (e.g. `grid_res = ...grid_search(...)`) silently received None.
        return test_score, grid

    def predict(self, x):
        # FIX: was `self.model.predict(x)` — `self.model` is the estimator
        # CLASS, which would raise; predictions must come from the fitted
        # instance.
        return self.model_instance.predict(x)
Decision Tree¶
A decision tree is a versatile tool in machine learning, working well for sorting things into groups or guessing values. It chops data into smaller chunks based on different traits, trying to keep things similar within each chunk.
It starts by looking at all the data and picks a trait that splits it into two groups that are as much alike as possible for the thing we care about. It keeps doing this for each smaller group until it hits specific rules, like how detailed the tree can get or how many examples are in each group.
For sorting things, it tries to find traits that give the most useful info or reduce the messiness in the groups. For guessing values, it looks for traits that get our guesses as close as possible to the real answers.
Once the tree's made, it can guess things about new data by following its branches from start to finish. Where it ends tells us our prediction. For sorting things, it might go with the most common group, and for guessing numbers, it could be an average.
These trees are great because they're simple to understand and display visually. They handle different types of data, like categories or numbers. But sometimes, they can get too detailed or struggle if there's noisy or extra stuff in the data. People have ways to fix this, like simplifying the tree or using groups of trees together to make better decisions.
| Hyper-parameter | Description |
|---|---|
| max_depth | The maximum depth of the decision tree. A larger value of max_depth can capture more complex patterns in the data, but may also lead to overfitting. A smaller value of max_depth may lead to underfitting. |
| min_samples_split | The minimum number of samples required to split an internal node. |
| min_samples_leaf | The minimum number of samples required to be at a leaf node. |
| criterion | The function used to measure the quality of a split. The two options available are gini and entropy. |
| splitter | The strategy used to choose the split at each node. The two options here are random and best. |
# Decision-tree hyper-parameter grid; random_state pinned for reproducibility.
grid_s_params = {
    "criterion": ["gini", "entropy"],
    "splitter": ["best", "random"],
    "max_depth": range(2, 9),
    "min_samples_split": range(2, 9),
    "min_samples_leaf": range(2, 9),
    "random_state": [54],
}
dtree_model = Classifier(DecisionTreeClassifier, x_train, y_train, x_test, y_test)
dtree_model.grid_search(grid_s_params)
- model's test accuracy : 0.891
- Test Score(accuracy) : 0.914
- Best hyperparameters : {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 2, 'min_samples_split': 2, 'random_state': 54, 'splitter': 'best'}
- Best model's train score (accuracy): 0.919
- Best model's test score : 0.914
KNN¶
K-Nearest Neighbors(KNN), is a straightforward and efficient method used for sorting things into categories or guessing values.
It finds the closest K neighbors from the training data to a new item and decides its category or value based on either voting or averaging those neighbors' characteristics. KNN doesn't make any strict assumptions about how the data is spread out.
Choosing the right K value is crucial. If K is large, it can help smooth out any irregularities in the data but might make the model too simple. If K is small, it can capture intricate details but might overcomplicate the model.
Some settings include:
- algorithm:
- This specifies how to find those nearest neighbors.
- The options include auto, ball_tree, kd_tree, and brute.
- n_neighbors:
- It's the number of neighbors considered when making a decision.
- Higher values reduce noise but might oversimplify, while lower values can catch complex details but risk overfitting.
- metric:
- It's the measure of distance used to find these neighbors in the data.
# KNN hyper-parameter grid: neighbour count and distance metric.
grid_s_params = {
    "n_neighbors": range(2,20),
    "metric": ["euclidean", "manhattan", "minkowski"]
}
knear_model = Classifier(KNeighborsClassifier, x_train, y_train, x_test, y_test)
knear_model.grid_search(grid_s_params)
- model's test accuracy : 0.887
- Test Score(accuracy) : 0.890
- Best hyperparameters : {'metric': 'euclidean', 'n_neighbors': 3}
- Best model's train score (accuracy): 0.913
- Best model's test score : 0.890
Logistic Regression¶
Logistic Regression serves as a supervised learning technique primarily employed for classification tasks. It operates by modeling the likelihood of a binary outcome (like 0 or 1) based on the input features. This model produces a probability score between 0 and 1, representing the chance of the binary outcome occurring. Logistic regression, being parametric, makes certain assumptions about the data's distribution. It's versatile, handling both categorical and numerical data.
The logistic regression equation is:
$$(P(y=1|X)) = \sigma(z) = \frac{1}{1 + e^{-z}}$$
where the z is: $$z = w_1x_1 + w_2x_2 + ... + w_nx_n + b$$ where each $w_i$ is the weight associated with the $i^{th}$ feature.
The model computes optimal coefficients that minimize the difference between predicted probabilities and actual labels in the training data. Predictions involve calculating the probability and applying a decision threshold.
| Hyperparameter | Description |
|---|---|
| Solver | Chooses the optimization algorithm for coefficient optimization. Common solvers include lbfgs, liblinear, newton-cg, sag, and saga. |
| Penalty | Dictates L1 or L2 regularization to curb overfitting. It picks between the two. L1 can aid feature selection by zeroing some coefficients, while L2 shrinks coefficients towards zero. |
| C | Governs the regularization strength. A lower C means stronger regularization, preventing overfitting but potentially causing underfitting. |
# Logistic-regression grid: regularisation strength C and solver (L2 only,
# since several listed solvers do not support L1).
grid_s_params = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "penalty": ["l2"],
    "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
}
logreg_model = Classifier(LogisticRegression, x_train, y_train, x_test, y_test)
logreg_model.grid_search(grid_s_params)
- model's test accuracy : 0.881
- Test Score(accuracy) : 0.879
- Best hyperparameters : {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
- Best model's train score (accuracy): 0.897
- Best model's test score : 0.879
Confusion Matrix¶
# Confusion matrices (test split) for each tuned classifier.
dtree_model.confusion_matrix()
knear_model.confusion_matrix()
logreg_model.confusion_matrix()
Perform randomized search for each model¶
# Refit each model with hand-picked hyper-parameters and collect test-set
# predictions for the comparison plots below.
dt_model = DecisionTreeClassifier(
    max_depth=4, min_samples_split=8,
    min_samples_leaf=2, random_state=54,
    splitter='best', criterion='gini'
)
dt_model.fit(x_train, y_train)
y_pred_dt = dt_model.predict(x_test)
# K-Nearest Neighbors Classifier
knn_model = KNeighborsClassifier(n_neighbors=9, metric='euclidean')
# weights='uniform', algorithm='kd_tree')
knn_model.fit(x_train, y_train)
y_pred_knn = knn_model.predict(x_test)
# Logistic Regression Classifier
logreg_model = LogisticRegression(C=1, penalty='l2', solver='newton-cg')
# max_iter=2000, random_state=42)
logreg_model.fit(x_train, y_train)
y_pred_logreg = logreg_model.predict(x_test)
# Map model name -> (fitted model, test-set predictions).
models = {'Decision Tree': (dt_model, y_pred_dt),
          'KNN': (knn_model, y_pred_knn),
          'Logistic Regression': (logreg_model, y_pred_logreg)}
def plot_model_evaluation(models, X_test, y_test):
    """For each model draw its confusion matrix (top row) and a heatmap of its
    classification report with the overall accuracy (bottom row)."""
    fig, axes = plt.subplots(nrows=2, ncols=len(models), figsize=(20, 10))
    plt.subplots_adjust(hspace=0.5)
    class_labels = ['Low', 'High']
    for col, (name, (model, y_pred)) in enumerate(models.items()):
        # Top row: confusion matrix heatmap.
        matrix = confusion_matrix(y_test, y_pred)
        sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=axes[0, col],
                    xticklabels=class_labels, yticklabels=class_labels)
        axes[0, col].set_title(f'Confusion Matrix - {name}')
        # Bottom row: per-class precision/recall/f1 (last report row dropped).
        report = classification_report(y_test, y_pred, output_dict=True)
        sns.heatmap(pd.DataFrame(report).iloc[:-1, :].T, annot=True, cmap='Blues', ax=axes[1, col])
        axes[1, col].set_title(f'Classification Report - {name}')
        # Annotate the overall accuracy under the report heatmap.
        acc = accuracy_score(y_test, y_pred)
        axes[1, col].text(0.5, -0.2, f'Accuracy: {acc:.2%}', horizontalalignment='center',
                          verticalalignment='center', transform=axes[1, col].transAxes)
    plt.show()
# Render the side-by-side evaluation for all three models.
plot_model_evaluation(models, x_test, y_test)
def hyper_param_comp(param_range, init_params, param):
    """Sweep one RandomForest hyper-parameter and plot train vs. test accuracy.

    :param param_range: iterable of values to try for `param`
    :param init_params: base hyper-parameter dict (mutated: `param` is overwritten)
    :param param: name of the hyper-parameter being swept
    """
    test_accs = []
    train_accs = []
    for value in param_range:
        init_params[param] = value
        # FIX: the original passed a pre-built estimator INSTANCE plus the
        # undefined names `grid_search_params`, `X_train`, `X_test` to
        # Classifier (whose signature is (model_class, x_train, y_train,
        # x_test, y_test, params)) and then called nonexistent methods
        # `calc_accuracy` / `train_accuracy`.  Use the wrapper's real API and
        # the notebook-level splits instead.
        rf = Classifier(RandomForestClassifier, x_train, y_train, x_test, y_test, dict(init_params))
        test_accs.append(rf.accuracy_test())
        train_accs.append(rf.accuracy_train())
    plt.plot(param_range, test_accs, color='blue', label='test')
    plt.plot(param_range, train_accs, color='red', label='train')
    plt.xlabel(param)
    plt.ylabel('Accuracy')
    plt.legend(loc="lower right")
GridSearchCV¶
GridSearchCV is a technique used in machine learning to fine-tune hyperparameters effectively, aiming to find the best combination for a model. It involves specifying various hyperparameters and testing different combinations to determine the optimal set.
| Key Parameter | Description |
|---|---|
| estimator | Represents the model under consideration for parameter tuning. |
| param_grid | A dictionary or list of dictionaries that outlines the hyperparameters and their potential values to explore. |
| cv | Denotes the cross-validation strategy, determining how the dataset is split into training and validation sets. |
| scoring | Evaluates and scores the model's performance against the validation set. |
| n_jobs | Specifies the number of CPU cores used for parallel processing. Using n_jobs=-1 utilizes all available CPU cores for faster computation. |
| verbose | Controls the level of detail in the output during the search process. Setting verbose=1 displays progress messages during the hyperparameter search. |
# Re-run the grid searches (same grids as earlier) for the report section.
dtree_grid_s_params = {
    "criterion": ["gini", "entropy"],
    "splitter": ["best", "random"],
    "max_depth": range(2, 9),
    "min_samples_split": range(2, 9),
    "min_samples_leaf": range(2, 9),
    "random_state": [54],
}
print(f'Decision Tree')
dtree_model.grid_search(dtree_grid_s_params)
knear_grid_s_params = {
    "n_neighbors": range(2,20),
    "metric": ["euclidean", "manhattan", "minkowski"]
}
print(f'K Nearest Neighbors')
grid_res = knear_model.grid_search(knear_grid_s_params)
# print(f'Logistic reg model')
# grid_s_params = {
# 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
# "penalty": ["l2"],
# "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
# }
# grid_res = logreg_model.grid_search()
Decision Tree
- model's test accuracy : 0.891
- Test Score(accuracy) : 0.914
- Best hyperparameters : {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 2, 'min_samples_split': 2, 'random_state': 54, 'splitter': 'best'}
- Best model's train score (accuracy): 0.919
- Best model's test score : 0.914
K Nearest Neighbors
- model's test accuracy : 0.887
- Test Score(accuracy) : 0.890
- Best hyperparameters : {'metric': 'euclidean', 'n_neighbors': 3}
- Best model's train score (accuracy): 0.913
- Best model's test score : 0.890
# Retrain the decision tree with manually chosen hyper-parameters and
# compare train vs. test accuracy.
dtree_params = {'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 8, 'min_samples_split': 2, 'random_state': 1}
new_dtree_classifier = Classifier(DecisionTreeClassifier, x_train, y_train, x_test, y_test, dtree_params)
print(f'Decision Tree test: {new_dtree_classifier.accuracy_test() * 100:.3f}%')
print(f'Decision Tree train: {new_dtree_classifier.accuracy_train() * 100:.3f}%')
Decision Tree test: 91.220% Decision Tree train: 92.921%
# Same comparison for KNN with a manually chosen neighbour count.
knear_params = {'n_neighbors': 9}
new_knear_classifier = Classifier(KNeighborsClassifier, x_train, y_train, x_test, y_test, knear_params)
print(f'knn test: {new_knear_classifier.accuracy_test() * 100:.3f}%')
print(f'knn train: {new_knear_classifier.accuracy_train() * 100:.3f}%')
knn test: 88.542% knn train: 92.283%
Underfitting and Overfitting¶
Some key concepts:
- Bias
- Bias refers to how well a model captures the intricacies of the training data.
- Higher bias indicates less precision in capturing complex patterns, as it might not consider all features, including noisy data.
- Lower bias suggests better accuracy on the training data, but it might lead to overfitting if the model starts capturing noise.
- Ideally, you want to balance bias to prevent both underfitting and overfitting.
- Bias refers to how well a model captures the intricacies of the training data.
- Variance
- Variance measures the model's performance on new, unseen data.
- Lower variance indicates that the model generalizes well and accurately identifies patterns in the testing dataset.
- High variance, however, implies that the model may be overfitting, capturing noise in the training data rather than general patterns.
- Variance measures the model's performance on new, unseen data.
- Trade-off Between Bias and Variance
- Achieving both low bias and low variance simultaneously is challenging.
- Reducing one typically increases the other. It's crucial to find a balance to optimize model performance.
Overfitting
- Overfitting occurs when a model becomes overly complex, fitting the training data too closely.
- It captures noise instead of the actual patterns, leading to poor performance on new, unseen data.
- Factors contributing to overfitting include too many features, excessive model complexity, or prolonged training.
- To detect overfitting, compare performance on training and validation data.
- A model that excels on training data but performs poorly on validation data is likely overfitting.
| Technique | Description |
|---|---|
| Regularization | Adds a penalty to the loss function to discourage overly large weights. |
| Early Stopping | Halts training when the model's performance on validation data plateaus. |
| Model Simplification | Reduces complexity to prevent overfitting. |
Underfitting
- Underfitting happens when a model is too simplistic to capture underlying patterns in the data.
- It results in poor performance on both training and new data.
- Causes of underfitting include too few features, excessive simplicity, or insufficient training.
- To identify underfitting, assess the model's performance on both training and validation data.
- A model struggling with both is likely underfitting.
| Technique | Description |
|---|---|
| Add Features | Includes more relevant features to capture data patterns. |
| Increase Model Complexity | Uses more complex models that better capture the data. |
| Extend Training Duration | Allows more time for the model to learn from the data. |
Finding the right balance between model complexity and available data is crucial. A more intricate model may require more data to avoid overfitting.
# Report training-set accuracy for the two fitted classifiers.
for model_name, fitted in (('Decision Tree', dtree_model), ('K Nearest Neighbors', knear_model)):
    print(f'{model_name}: {fitted.accuracy_train() * 100:.3f}%')
Decision Tree: 100.000% K Nearest Neighbors: 93.495%
Results
As we can see, the result on the training data is a little better than the result on the test data, but overall they are in the same range.
| Model | Accuracy |
|---|---|
| Decision Tree | 90.923% |
| K Nearest Neighbors | 88.095% |
| Logistic Regression | 87.649% |
The result changes after changing the threshold for removing columns with low correlations, but the total change is insignificant.
# Render the fitted decision tree: nodes are colored by majority class
# (filled=True) and labeled with the training feature names; the binary
# target is displayed as 'Low'/'High'. Assumes dtree_model and x_train
# were defined in earlier cells.
plot_tree(dtree_model.model_instance, filled=True, feature_names=x_train.columns, class_names=['Low', 'High'])
[Text(0.24743238369193502, 0.9772727272727273, 'MntMeatProducts <= -0.458\ngini = 0.5\nsamples = 1568\nvalue = [800, 768]\nclass = Low'), Text(0.12083214591051582, 0.9318181818181818, 'MntCoffee <= 0.057\ngini = 0.176\nsamples = 769\nvalue = [694, 75]\nclass = Low'), Text(0.11171273867198632, 0.8863636363636364, 'MntGoldProds <= 0.111\ngini = 0.079\nsamples = 724\nvalue = [694, 30]\nclass = Low'), Text(0.05927614705044172, 0.8409090909090909, 'MntSweetProducts <= 0.943\ngini = 0.04\nsamples = 685\nvalue = [671, 14]\nclass = Low'), Text(0.050156739811912224, 0.7954545454545454, 'MntCoffee <= -0.334\ngini = 0.035\nsamples = 683\nvalue = [671, 12]\nclass = Low'), Text(0.018238814477058992, 0.75, 'MntCoffee <= -0.414\ngini = 0.015\nsamples = 654\nvalue = [649, 5]\nclass = Low'), Text(0.009119407238529496, 0.7045454545454546, 'gini = 0.0\nsamples = 580\nvalue = [580, 0]\nclass = Low'), Text(0.027358221715588486, 0.7045454545454546, 'Income <= 0.444\ngini = 0.126\nsamples = 74\nvalue = [69, 5]\nclass = Low'), Text(0.018238814477058992, 0.6590909090909091, 'MntMeatProducts <= -0.569\ngini = 0.104\nsamples = 73\nvalue = [69, 4]\nclass = Low'), Text(0.009119407238529496, 0.6136363636363636, 'gini = 0.0\nsamples = 58\nvalue = [58, 0]\nclass = Low'), Text(0.027358221715588486, 0.6136363636363636, 'MntMeatProducts <= -0.529\ngini = 0.391\nsamples = 15\nvalue = [11, 4]\nclass = Low'), Text(0.018238814477058992, 0.5681818181818182, 'MntCoffee <= -0.394\ngini = 0.5\nsamples = 8\nvalue = [4, 4]\nclass = Low'), Text(0.009119407238529496, 0.5227272727272727, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = High'), Text(0.027358221715588486, 0.5227272727272727, 'MntFishProducts <= -0.66\ngini = 0.444\nsamples = 6\nvalue = [4, 2]\nclass = Low'), Text(0.018238814477058992, 0.4772727272727273, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.036477628954117984, 0.4772727272727273, 'MntCoffee <= -0.379\ngini = 0.32\nsamples = 5\nvalue = [4, 1]\nclass = Low'), 
Text(0.027358221715588486, 0.4318181818181818, 'MntMeatProducts <= -0.547\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'), Text(0.018238814477058992, 0.38636363636363635, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.036477628954117984, 0.38636363636363635, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.045597036192647475, 0.4318181818181818, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = Low'), Text(0.036477628954117984, 0.5681818181818182, 'gini = 0.0\nsamples = 7\nvalue = [7, 0]\nclass = Low'), Text(0.036477628954117984, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.08207466514676547, 0.75, 'Kidhome <= 0.105\ngini = 0.366\nsamples = 29\nvalue = [22, 7]\nclass = Low'), Text(0.06383585066970647, 0.7045454545454546, 'MntSweetProducts <= -0.353\ngini = 0.111\nsamples = 17\nvalue = [16, 1]\nclass = Low'), Text(0.05471644343117697, 0.6590909090909091, 'gini = 0.0\nsamples = 16\nvalue = [16, 0]\nclass = Low'), Text(0.07295525790823597, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.10031347962382445, 0.7045454545454546, 'MntSweetProducts <= -0.559\ngini = 0.5\nsamples = 12\nvalue = [6, 6]\nclass = Low'), Text(0.09119407238529495, 0.6590909090909091, 'MntMeatProducts <= -0.656\ngini = 0.375\nsamples = 8\nvalue = [2, 6]\nclass = High'), Text(0.08207466514676547, 0.6136363636363636, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.10031347962382445, 0.6136363636363636, 'MntCoffee <= -0.274\ngini = 0.245\nsamples = 7\nvalue = [1, 6]\nclass = High'), Text(0.09119407238529495, 0.5681818181818182, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = High'), Text(0.10943288686235395, 0.5681818181818182, 'MntCoffee <= -0.249\ngini = 0.375\nsamples = 4\nvalue = [1, 3]\nclass = High'), Text(0.10031347962382445, 0.5227272727272727, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.11855229410088344, 0.5227272727272727, 'gini = 
0.0\nsamples = 3\nvalue = [0, 3]\nclass = High'), Text(0.10943288686235395, 0.6590909090909091, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = Low'), Text(0.06839555428897122, 0.7954545454545454, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = High'), Text(0.16414933029353093, 0.8409090909090909, 'MntMeatProducts <= -0.498\ngini = 0.484\nsamples = 39\nvalue = [23, 16]\nclass = Low'), Text(0.15502992305500143, 0.7954545454545454, 'Income <= -0.104\ngini = 0.451\nsamples = 35\nvalue = [23, 12]\nclass = Low'), Text(0.12767170133941294, 0.75, 'NumWebVisitsMonth <= -1.45\ngini = 0.255\nsamples = 20\nvalue = [17, 3]\nclass = Low'), Text(0.11855229410088344, 0.7045454545454546, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.13679110857794244, 0.7045454545454546, 'MntGoldProds <= 0.14\ngini = 0.188\nsamples = 19\nvalue = [17, 2]\nclass = Low'), Text(0.12767170133941294, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.14591051581647194, 0.6590909090909091, 'MntFishProducts <= -0.019\ngini = 0.105\nsamples = 18\nvalue = [17, 1]\nclass = Low'), Text(0.13679110857794244, 0.6136363636363636, 'gini = 0.0\nsamples = 16\nvalue = [16, 0]\nclass = Low'), Text(0.15502992305500143, 0.6136363636363636, 'MntGoldProds <= 1.232\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'), Text(0.14591051581647194, 0.5681818181818182, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.16414933029353093, 0.5681818181818182, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.1823881447705899, 0.75, 'MntFruits <= -0.184\ngini = 0.48\nsamples = 15\nvalue = [6, 9]\nclass = High'), Text(0.17326873753206043, 0.7045454545454546, 'MntFishProducts <= -0.586\ngini = 0.496\nsamples = 11\nvalue = [6, 5]\nclass = Low'), Text(0.16414933029353093, 0.6590909090909091, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = High'), Text(0.1823881447705899, 0.6590909090909091, 'Income <= -0.052\ngini = 0.375\nsamples = 8\nvalue = [6, 
2]\nclass = Low'), Text(0.17326873753206043, 0.6136363636363636, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = High'), Text(0.1915075520091194, 0.6136363636363636, 'gini = 0.0\nsamples = 6\nvalue = [6, 0]\nclass = Low'), Text(0.1915075520091194, 0.7045454545454546, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = High'), Text(0.17326873753206043, 0.7954545454545454, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = High'), Text(0.1299515531490453, 0.8863636363636364, 'gini = 0.0\nsamples = 45\nvalue = [0, 45]\nclass = High'), Text(0.37403262147335425, 0.9318181818181818, 'MntCoffee <= -0.539\ngini = 0.23\nsamples = 799\nvalue = [106.0, 693.0]\nclass = High'), Text(0.24622399544029638, 0.8863636363636364, 'MntGoldProds <= -0.797\ngini = 0.452\nsamples = 55\nvalue = [36, 19]\nclass = Low'), Text(0.23710458820176689, 0.8409090909090909, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]\nclass = High'), Text(0.2553434026788259, 0.8409090909090909, 'MntSweetProducts <= 1.222\ngini = 0.403\nsamples = 50\nvalue = [36.0, 14.0]\nclass = Low'), Text(0.24622399544029638, 0.7954545454545454, 'Kidhome <= 1.963\ngini = 0.34\nsamples = 46\nvalue = [36, 10]\nclass = Low'), Text(0.23710458820176689, 0.75, 'MntGoldProds <= 2.362\ngini = 0.298\nsamples = 44\nvalue = [36, 8]\nclass = Low'), Text(0.2279851809632374, 0.7045454545454546, 'MntMeatProducts <= 3.731\ngini = 0.273\nsamples = 43\nvalue = [36.0, 7.0]\nclass = Low'), Text(0.2188657737247079, 0.6590909090909091, 'MntFruits <= -0.649\ngini = 0.245\nsamples = 42\nvalue = [36, 6]\nclass = Low'), Text(0.2097463664861784, 0.6136363636363636, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.2279851809632374, 0.6136363636363636, 'Income <= -0.082\ngini = 0.214\nsamples = 41\nvalue = [36, 5]\nclass = Low'), Text(0.2188657737247079, 0.5681818181818182, 'Income <= -0.235\ngini = 0.33\nsamples = 24\nvalue = [19, 5]\nclass = Low'), Text(0.2006269592476489, 0.5227272727272727, 'MntFruits <= 0.194\ngini = 0.1\nsamples = 19\nvalue 
= [18, 1]\nclass = Low'), Text(0.1915075520091194, 0.4772727272727273, 'gini = 0.0\nsamples = 18\nvalue = [18, 0]\nclass = Low'), Text(0.2097463664861784, 0.4772727272727273, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.23710458820176689, 0.5227272727272727, 'MntGoldProds <= 0.334\ngini = 0.32\nsamples = 5\nvalue = [1, 4]\nclass = High'), Text(0.2279851809632374, 0.4772727272727273, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = High'), Text(0.24622399544029638, 0.4772727272727273, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.23710458820176689, 0.5681818181818182, 'gini = 0.0\nsamples = 17\nvalue = [17, 0]\nclass = Low'), Text(0.23710458820176689, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.24622399544029638, 0.7045454545454546, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.2553434026788259, 0.75, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = High'), Text(0.2644628099173554, 0.7954545454545454, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = High'), Text(0.5018412475064121, 0.8863636363636364, 'MntMeatProducts <= -0.379\ngini = 0.17\nsamples = 744\nvalue = [70, 674]\nclass = High'), Text(0.3328583642063266, 0.8409090909090909, 'MntCoffee <= -0.223\ngini = 0.398\nsamples = 62\nvalue = [17, 45]\nclass = High'), Text(0.2918210316329439, 0.7954545454545454, 'Income <= 0.18\ngini = 0.49\nsamples = 21\nvalue = [12, 9]\nclass = Low'), Text(0.2735822171558849, 0.75, 'MntGoldProds <= 0.092\ngini = 0.391\nsamples = 15\nvalue = [11, 4]\nclass = Low'), Text(0.2644628099173554, 0.7045454545454546, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]\nclass = Low'), Text(0.2827016243944144, 0.7045454545454546, 'NumWebVisitsMonth <= 0.048\ngini = 0.444\nsamples = 6\nvalue = [2, 4]\nclass = High'), Text(0.2735822171558849, 0.6590909090909091, 'MntFruits <= 0.168\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = Low'), Text(0.2644628099173554, 0.6136363636363636, 'gini = 0.0\nsamples = 
2\nvalue = [2, 0]\nclass = Low'), Text(0.2827016243944144, 0.6136363636363636, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.2918210316329439, 0.6590909090909091, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = High'), Text(0.31005984611000287, 0.75, 'MntMeatProducts <= -0.392\ngini = 0.278\nsamples = 6\nvalue = [1, 5]\nclass = High'), Text(0.30094043887147337, 0.7045454545454546, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]\nclass = High'), Text(0.31917925334853237, 0.7045454545454546, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.3738956967797093, 0.7954545454545454, 'MntMeatProducts <= -0.401\ngini = 0.214\nsamples = 41\nvalue = [5, 36]\nclass = High'), Text(0.34653747506412086, 0.75, 'MntCoffee <= 1.135\ngini = 0.157\nsamples = 35\nvalue = [3, 32]\nclass = High'), Text(0.33741806782559136, 0.7045454545454546, 'gini = 0.0\nsamples = 24\nvalue = [0, 24]\nclass = High'), Text(0.3556568823026503, 0.7045454545454546, 'MntCoffee <= 1.495\ngini = 0.397\nsamples = 11\nvalue = [3, 8]\nclass = High'), Text(0.34653747506412086, 0.6590909090909091, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = Low'), Text(0.3647762895411798, 0.6590909090909091, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]\nclass = High'), Text(0.4012539184952978, 0.75, 'NumWebVisitsMonth <= 0.477\ngini = 0.444\nsamples = 6\nvalue = [2, 4]\nclass = High'), Text(0.3921345112567683, 0.7045454545454546, 'MntSweetProducts <= -0.377\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = Low'), Text(0.3830151040182388, 0.6590909090909091, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = Low'), Text(0.4012539184952978, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.4103733257338273, 0.7045454545454546, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = High'), Text(0.6708241308064976, 0.8409090909090909, 'NumWebVisitsMonth <= -1.665\ngini = 0.143\nsamples = 682\nvalue = [53, 629]\nclass = High'), Text(0.4901681390709604, 0.7954545454545454, 
'Income <= 0.553\ngini = 0.329\nsamples = 101\nvalue = [21, 80]\nclass = High'), Text(0.4468509546879453, 0.75, 'MntMeatProducts <= 0.902\ngini = 0.499\nsamples = 19\nvalue = [10, 9]\nclass = Low'), Text(0.4286121402108863, 0.7045454545454546, 'MntGoldProds <= 2.72\ngini = 0.18\nsamples = 10\nvalue = [9, 1]\nclass = Low'), Text(0.4194927329723568, 0.6590909090909091, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]\nclass = Low'), Text(0.4377315474494158, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.4650897691650043, 0.7045454545454546, 'MntCoffee <= -0.328\ngini = 0.198\nsamples = 9\nvalue = [1, 8]\nclass = High'), Text(0.4559703619264748, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.47420917640353377, 0.6590909090909091, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]\nclass = High'), Text(0.5334853234539755, 0.75, 'MntCoffee <= 0.736\ngini = 0.232\nsamples = 82\nvalue = [11, 71]\nclass = High'), Text(0.5015673981191222, 0.7045454545454546, 'MntMeatProducts <= 1.79\ngini = 0.081\nsamples = 47\nvalue = [2, 45]\nclass = High'), Text(0.49244799088059277, 0.6590909090909091, 'gini = 0.0\nsamples = 42\nvalue = [0, 42]\nclass = High'), Text(0.5106868053576518, 0.6590909090909091, 'MntMeatProducts <= 2.054\ngini = 0.48\nsamples = 5\nvalue = [2, 3]\nclass = High'), Text(0.5015673981191222, 0.6136363636363636, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = Low'), Text(0.5198062125961812, 0.6136363636363636, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = High'), Text(0.5654032487888287, 0.7045454545454546, 'MntMeatProducts <= 0.601\ngini = 0.382\nsamples = 35\nvalue = [9, 26]\nclass = High'), Text(0.5471644343117698, 0.6590909090909091, 'MntCoffee <= 1.449\ngini = 0.444\nsamples = 6\nvalue = [4, 2]\nclass = Low'), Text(0.5380450270732402, 0.6136363636363636, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = Low'), Text(0.5562838415502992, 0.6136363636363636, 'gini = 0.0\nsamples = 2\nvalue = [0, 
2]\nclass = High'), Text(0.5836420632658877, 0.6590909090909091, 'MntMeatProducts <= 1.241\ngini = 0.285\nsamples = 29\nvalue = [5, 24]\nclass = High'), Text(0.5745226560273582, 0.6136363636363636, 'gini = 0.0\nsamples = 14\nvalue = [0, 14]\nclass = High'), Text(0.5927614705044172, 0.6136363636363636, 'MntFishProducts <= 1.171\ngini = 0.444\nsamples = 15\nvalue = [5, 10]\nclass = High'), Text(0.5745226560273582, 0.5681818181818182, 'MntCoffee <= 0.88\ngini = 0.18\nsamples = 10\nvalue = [1, 9]\nclass = High'), Text(0.5654032487888287, 0.5227272727272727, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.5836420632658877, 0.5227272727272727, 'gini = 0.0\nsamples = 9\nvalue = [0, 9]\nclass = High'), Text(0.6110002849814762, 0.5681818181818182, 'MntGoldProds <= -0.159\ngini = 0.32\nsamples = 5\nvalue = [4, 1]\nclass = Low'), Text(0.6018808777429467, 0.5227272727272727, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = Low'), Text(0.6201196922200057, 0.5227272727272727, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.8514801225420348, 0.7954545454545454, 'Income <= -0.887\ngini = 0.104\nsamples = 581\nvalue = [32, 549]\nclass = High'), Text(0.8024722143060701, 0.75, 'MntSweetProducts <= -0.474\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'), Text(0.7933528070675406, 0.7045454545454546, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.8115916215445996, 0.7045454545454546, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.9004880307779994, 0.75, 'MntFruits <= 4.205\ngini = 0.101\nsamples = 579\nvalue = [31, 548]\nclass = High'), Text(0.8298304360216586, 0.7045454545454546, 'MntSweetProducts <= -0.232\ngini = 0.099\nsamples = 577\nvalue = [30, 547]\nclass = High'), Text(0.7158734682245654, 0.6590909090909091, 'MntFruits <= 3.413\ngini = 0.159\nsamples = 172\nvalue = [15, 157]\nclass = High'), Text(0.7067540609860359, 0.6136363636363636, 'MntCoffee <= 3.385\ngini = 0.15\nsamples = 171\nvalue = [14, 
157]\nclass = High'), Text(0.6976346537475064, 0.5681818181818182, 'MntCoffee <= -0.269\ngini = 0.141\nsamples = 170\nvalue = [13.0, 157.0]\nclass = High'), Text(0.6383585066970647, 0.5227272727272727, 'NumWebVisitsMonth <= -0.594\ngini = 0.358\nsamples = 30\nvalue = [7, 23]\nclass = High'), Text(0.6292390994585352, 0.4772727272727273, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = Low'), Text(0.6474779139355942, 0.4772727272727273, 'MntMeatProducts <= -0.248\ngini = 0.293\nsamples = 28\nvalue = [5, 23]\nclass = High'), Text(0.6383585066970647, 0.4318181818181818, 'MntCoffee <= -0.366\ngini = 0.496\nsamples = 11\nvalue = [5, 6]\nclass = High'), Text(0.6201196922200057, 0.38636363636363635, 'MntMeatProducts <= -0.275\ngini = 0.278\nsamples = 6\nvalue = [1, 5]\nclass = High'), Text(0.6110002849814762, 0.3409090909090909, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = High'), Text(0.6292390994585352, 0.3409090909090909, 'MntSweetProducts <= -0.535\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'), Text(0.6201196922200057, 0.29545454545454547, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.6383585066970647, 0.29545454545454547, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.6565973211741237, 0.38636363636363635, 'MntGoldProds <= 1.029\ngini = 0.32\nsamples = 5\nvalue = [4, 1]\nclass = Low'), Text(0.6474779139355942, 0.3409090909090909, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = Low'), Text(0.6657167284126532, 0.3409090909090909, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.6565973211741237, 0.4318181818181818, 'gini = 0.0\nsamples = 17\nvalue = [0, 17]\nclass = High'), Text(0.7569108007979481, 0.5227272727272727, 'MntFruits <= 1.149\ngini = 0.082\nsamples = 140\nvalue = [6, 134]\nclass = High'), Text(0.7204331718438302, 0.4772727272727273, 'Income <= 0.845\ngini = 0.046\nsamples = 127\nvalue = [3, 124]\nclass = High'), Text(0.7021943573667712, 0.4318181818181818, 'MntGoldProds <= -0.671\ngini = 
0.018\nsamples = 112\nvalue = [1, 111]\nclass = High'), Text(0.6930749501282417, 0.38636363636363635, 'NumWebVisitsMonth <= -1.236\ngini = 0.278\nsamples = 6\nvalue = [1, 5]\nclass = High'), Text(0.6839555428897122, 0.3409090909090909, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.7021943573667712, 0.3409090909090909, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]\nclass = High'), Text(0.7113137646053006, 0.38636363636363635, 'gini = 0.0\nsamples = 106\nvalue = [0, 106]\nclass = High'), Text(0.7386719863208892, 0.4318181818181818, 'Income <= 0.884\ngini = 0.231\nsamples = 15\nvalue = [2, 13]\nclass = High'), Text(0.7295525790823596, 0.38636363636363635, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.7477913935594186, 0.38636363636363635, 'MntMeatProducts <= -0.159\ngini = 0.133\nsamples = 14\nvalue = [1, 13]\nclass = High'), Text(0.7386719863208892, 0.3409090909090909, 'MntFishProducts <= -0.238\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'), Text(0.7295525790823596, 0.29545454545454547, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.7477913935594186, 0.29545454545454547, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.7569108007979481, 0.3409090909090909, 'gini = 0.0\nsamples = 12\nvalue = [0, 12]\nclass = High'), Text(0.7933884297520661, 0.4772727272727273, 'MntMeatProducts <= 1.613\ngini = 0.355\nsamples = 13\nvalue = [3, 10]\nclass = High'), Text(0.7751496152750071, 0.4318181818181818, 'MntFruits <= 1.262\ngini = 0.18\nsamples = 10\nvalue = [1, 9]\nclass = High'), Text(0.7660302080364776, 0.38636363636363635, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.7842690225135366, 0.38636363636363635, 'gini = 0.0\nsamples = 9\nvalue = [0, 9]\nclass = High'), Text(0.8116272442291251, 0.4318181818181818, 'MntFruits <= 2.696\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = Low'), Text(0.8025078369905956, 0.38636363636363635, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = Low'), 
Text(0.8207466514676546, 0.38636363636363635, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.7158734682245654, 0.5681818181818182, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.7249928754630949, 0.6136363636363636, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.9437874038187518, 0.6590909090909091, 'MntSweetProducts <= 3.415\ngini = 0.071\nsamples = 405\nvalue = [15, 390]\nclass = High'), Text(0.9058136221145625, 0.6136363636363636, 'MntFruits <= 1.111\ngini = 0.06\nsamples = 388\nvalue = [12, 376]\nclass = High'), Text(0.8966942148760331, 0.5681818181818182, 'MntFruits <= 1.086\ngini = 0.087\nsamples = 262\nvalue = [12, 250]\nclass = High'), Text(0.8875748076375035, 0.5227272727272727, 'MntCoffee <= -0.388\ngini = 0.081\nsamples = 261\nvalue = [11, 250]\nclass = High'), Text(0.8389854659447136, 0.4772727272727273, 'Income <= 0.256\ngini = 0.32\nsamples = 10\nvalue = [2, 8]\nclass = High'), Text(0.8298660587061841, 0.4318181818181818, 'gini = 0.0\nsamples = 7\nvalue = [0, 7]\nclass = High'), Text(0.8481048731832431, 0.4318181818181818, 'MntFruits <= 0.143\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = Low'), Text(0.8389854659447136, 0.38636363636363635, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.8572242804217726, 0.38636363636363635, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = Low'), Text(0.9361641493302936, 0.4772727272727273, 'MntGoldProds <= 3.0\ngini = 0.069\nsamples = 251\nvalue = [9, 242]\nclass = High'), Text(0.9068110572812768, 0.4318181818181818, 'Income <= 1.507\ngini = 0.062\nsamples = 248\nvalue = [8, 240]\nclass = High'), Text(0.8754630948988316, 0.38636363636363635, 'MntFruits <= 0.86\ngini = 0.05\nsamples = 236\nvalue = [6, 230]\nclass = High'), Text(0.8401253918495298, 0.3409090909090909, 'MntCoffee <= 0.533\ngini = 0.035\nsamples = 222\nvalue = [4, 218]\nclass = High'), Text(0.8310059846110003, 0.29545454545454547, 'gini = 0.0\nsamples = 105\nvalue = [0, 
105]\nclass = High'), Text(0.8492447990880593, 0.29545454545454547, 'MntCoffee <= 0.546\ngini = 0.066\nsamples = 117\nvalue = [4, 113]\nclass = High'), Text(0.8241664291821031, 0.25, 'MntFishProducts <= 1.748\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'), Text(0.8150470219435737, 0.20454545454545456, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.8332858364206327, 0.20454545454545456, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.8743231689940154, 0.25, 'NumWebVisitsMonth <= 0.905\ngini = 0.051\nsamples = 115\nvalue = [3, 112]\nclass = High'), Text(0.8515246508976917, 0.20454545454545456, 'NumWebVisitsMonth <= -0.808\ngini = 0.037\nsamples = 107\nvalue = [2.0, 105.0]\nclass = High'), Text(0.8424052436591621, 0.1590909090909091, 'MntCoffee <= 0.726\ngini = 0.087\nsamples = 44\nvalue = [2, 42]\nclass = High'), Text(0.8241664291821031, 0.11363636363636363, 'MntMeatProducts <= 0.326\ngini = 0.375\nsamples = 4\nvalue = [1, 3]\nclass = High'), Text(0.8150470219435737, 0.06818181818181818, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.8332858364206327, 0.06818181818181818, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = High'), Text(0.8606440581362211, 0.11363636363636363, 'MntSweetProducts <= 0.059\ngini = 0.049\nsamples = 40\nvalue = [1, 39]\nclass = High'), Text(0.8515246508976917, 0.06818181818181818, 'MntGoldProds <= -0.072\ngini = 0.32\nsamples = 5\nvalue = [1, 4]\nclass = High'), Text(0.8424052436591621, 0.022727272727272728, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = High'), Text(0.8606440581362211, 0.022727272727272728, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.8697634653747507, 0.06818181818181818, 'gini = 0.0\nsamples = 35\nvalue = [0, 35]\nclass = High'), Text(0.8606440581362211, 0.1590909090909091, 'gini = 0.0\nsamples = 63\nvalue = [0, 63]\nclass = High'), Text(0.8971216870903391, 0.20454545454545456, 'MntMeatProducts <= -0.071\ngini = 0.219\nsamples = 8\nvalue = 
[1, 7]\nclass = High'), Text(0.8880022798518097, 0.1590909090909091, 'MntCoffee <= 1.637\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'), Text(0.8788828726132801, 0.11363636363636363, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.8971216870903391, 0.11363636363636363, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.9062410943288686, 0.1590909090909091, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = High'), Text(0.9108007979481334, 0.3409090909090909, 'MntCoffee <= 0.357\ngini = 0.245\nsamples = 14\nvalue = [2, 12]\nclass = High'), Text(0.9016813907096038, 0.29545454545454547, 'MntFishProducts <= 1.949\ngini = 0.444\nsamples = 6\nvalue = [2, 4]\nclass = High'), Text(0.8925619834710744, 0.25, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = Low'), Text(0.9108007979481334, 0.25, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = High'), Text(0.9199202051866628, 0.29545454545454547, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]\nclass = High'), Text(0.9381590196637218, 0.38636363636363635, 'MntFishProducts <= -0.477\ngini = 0.278\nsamples = 12\nvalue = [2, 10]\nclass = High'), Text(0.9290396124251924, 0.3409090909090909, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.9472784269022514, 0.3409090909090909, 'Kidhome <= 0.105\ngini = 0.165\nsamples = 11\nvalue = [1, 10]\nclass = High'), Text(0.9381590196637218, 0.29545454545454547, 'gini = 0.0\nsamples = 10\nvalue = [0, 10]\nclass = High'), Text(0.9563978341407808, 0.29545454545454547, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.9655172413793104, 0.4318181818181818, 'MntCoffee <= 0.14\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = High'), Text(0.9563978341407808, 0.38636363636363635, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.9746366486178398, 0.38636363636363635, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = High'), Text(0.9058136221145625, 0.5227272727272727, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), 
Text(0.9149330293530921, 0.5681818181818182, 'gini = 0.0\nsamples = 126\nvalue = [0, 126]\nclass = High'), Text(0.981761185522941, 0.6136363636363636, 'Income <= 0.691\ngini = 0.291\nsamples = 17\nvalue = [3, 14]\nclass = High'), Text(0.9726417782844116, 0.5681818181818182, 'MntSweetProducts <= 3.754\ngini = 0.49\nsamples = 7\nvalue = [3, 4]\nclass = High'), Text(0.963522371045882, 0.5227272727272727, 'MntGoldProds <= 1.493\ngini = 0.375\nsamples = 4\nvalue = [3, 1]\nclass = Low'), Text(0.9544029638073526, 0.4772727272727273, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = Low'), Text(0.9726417782844116, 0.4772727272727273, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'), Text(0.981761185522941, 0.5227272727272727, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = High'), Text(0.9908805927614706, 0.5681818181818182, 'gini = 0.0\nsamples = 10\nvalue = [0, 10]\nclass = High'), Text(0.9711456255343403, 0.7045454545454546, 'MntMeatProducts <= 0.166\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'), Text(0.9620262182958108, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'), Text(0.9802650327728698, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High')]
Ensemble Learning Methods¶
class RandForestClassifier:
    """Convenience wrapper around sklearn's RandomForestClassifier.

    Fits the forest on construction and keeps the train/test splits so
    that accuracy, confusion-matrix plotting, and grid search can be run
    with no extra arguments.
    """

    def __init__(self, x_train, y_train, x_test, y_test, n_estimators: int = 100, max_depth: int = 8):
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.set_params()

    def set_params(self):
        """Build the forest, fit it on the training split, and cache test predictions."""
        # random_state fixed so repeated runs are reproducible.
        self.randf = RandomForestClassifier(
            criterion='entropy',
            max_depth=self.max_depth,
            n_estimators=self.n_estimators,
            random_state=1,
        )
        self.randf.fit(self.x_train, self.y_train)
        self.randf_predict = self.randf.predict(self.x_test)

    def accuracy_test(self) -> float:
        """Return accuracy on the held-out test split."""
        return metrics.accuracy_score(self.y_test, self.randf_predict)

    def accuracy_train(self) -> float:
        """Return accuracy on the training split (useful for spotting overfitting)."""
        fitted_preds = self.randf.predict(self.x_train)
        return metrics.accuracy_score(self.y_train, fitted_preds)

    def confusion_matrix(self):
        """Plot the test-set confusion matrix with a blue colormap."""
        cm = metrics.confusion_matrix(self.y_test, self.randf_predict)
        cm_display = metrics.ConfusionMatrixDisplay(cm)
        cm_display.plot(cmap='Blues')
        plt.grid(False)
        plt.title('Random Forest Confusion Matrix')
        plt.show()

    def grid_search(self) -> tuple[float, GridSearchCV]:
        """Run an exhaustive search over forest size and depth.

        Returns the test-set accuracy of the best estimator together with
        the fitted GridSearchCV object (for best_params_ inspection).
        """
        candidate_params = {
            'n_estimators': range(70, 230, 30),
            'criterion': ['entropy'],
            'max_depth': range(4, 10),
            'random_state': [1],
        }
        searcher = GridSearchCV(self.randf, candidate_params, scoring='accuracy', n_jobs=2)
        searcher.fit(self.x_train, self.y_train)
        best_test_score = searcher.score(self.x_test, self.y_test)
        return best_test_score, searcher
# Fit the default random forest and report accuracy on both splits.
randf_model = RandForestClassifier(x_train, y_train, x_test, y_test)
train_acc = randf_model.accuracy_train() * 100
test_acc = randf_model.accuracy_test() * 100
print(f'Random Forest Train Acc: {train_acc:.3f}%')
print(f'Random Forest Test Acc: {test_acc:.3f}%')
Random Forest Train Acc: 96.556% Random Forest Test Acc: 92.560%
# Visualize test-set errors, then tune hyperparameters via grid search.
randf_model.confusion_matrix()
grid_res = randf_model.grid_search()
best_score, fitted_search = grid_res
print(f'Random Forest\nTest Score: {best_score}\nParams: {fitted_search.best_params_}')
Random Forest
Test Score: 0.9241071428571429
Params: {'criterion': 'entropy', 'max_depth': 7, 'n_estimators': 100, 'random_state': 1}
Hyperparameters
def n_estimators_effects():
    """Plot train/test accuracy of the random forest as n_estimators grows.

    Refits a RandForestClassifier for each candidate forest size and
    draws both accuracy curves on the current matplotlib axes.
    """
    n_estimators = range(1, 200, 10)
    train_res = []
    test_res = []
    for est_count in n_estimators:
        randf = RandForestClassifier(x_train, y_train, x_test, y_test, n_estimators=est_count)
        test_res.append(randf.accuracy_test())
        train_res.append(randf.accuracy_train())
    # label= is required for plt.legend to have artists to show; the
    # original calls produced "No artists with labels found" UserWarnings.
    plt.plot(n_estimators, test_res, label='Test')
    plt.plot(n_estimators, train_res, label='Train')
    plt.xlabel('n_estimators')
    plt.ylabel('Accuracy')
    plt.legend(loc="lower right")
def max_depth_effects():
    """Plot train/test accuracy of the random forest as max_depth grows."""
    max_depths = range(1, 20)
    train_res = []
    test_res = []
    for depth in max_depths:
        randf = RandForestClassifier(x_train, y_train, x_test, y_test,
                                     max_depth=depth)
        test_res.append(randf.accuracy_test())
        train_res.append(randf.accuracy_train())
    # Label both curves so plt.legend() has artists to show — previously the
    # legend was empty and raised "No artists with labels found" UserWarning.
    plt.plot(max_depths, test_res, label='Test')
    plt.plot(max_depths, train_res, label='Train')
    plt.xlabel('max_depth')
    plt.ylabel('Accuracy')
    plt.legend(loc="lower right")
# Side-by-side hyperparameter sweeps on one figure
plt.figure(figsize=(10, 4))
plt.suptitle('Hyperparameter Effects on Accuracy on Random Forest Classifier')
for subplot_index, plot_fn in ((1, n_estimators_effects), (2, max_depth_effects)):
    plt.subplot(1, 2, subplot_index)
    plot_fn()
C:\Users\ASUS\AppData\Local\Temp\ipykernel_22524\1355520063.py:14: UserWarning: No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. plt.legend(loc="lower right") C:\Users\ASUS\AppData\Local\Temp\ipykernel_22524\1355520063.py:30: UserWarning: No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument. plt.legend(loc="lower right")
Comparative Analysis of Bias and Variance
# Decompose expected loss into bias and variance for both models
dtree = DecisionTreeClassifier(**dtree_params)
_, dtree_bias, dtree_var = bias_variance_decomp(
    dtree,
    x_train.values, y_train.values,
    x_test.values, y_test.values,
    loss='mse', random_seed=10,
)
_, randf_bias, randf_var = bias_variance_decomp(
    randf_model.randf,
    x_train.values, y_train.values,
    x_test.values, y_test.values,
    loss='mse', random_seed=1,
)
for metric_name, metric_value in (('Decision Tree Bias', dtree_bias),
                                  ('Decision Tree Variance', dtree_var),
                                  ('Random Forest Bias', randf_bias),
                                  ('Random Forest Variance', randf_var)):
    display(HTML(f'<b>{metric_name}:</b> {metric_value:.3f}'))
The comparison here is between the Decision Tree model and the Random Forest model.
Bias refers to the difference between the average prediction of a model and the actual value we aim to predict. High bias can lead to underfitting, where the model fails to capture the underlying relationships between features and target outputs.
Variance measures the variability of model predictions for a given data point, indicating the spread of the data. High variance can lead to overfitting, where the model learns the noise in the training data instead of the true signal.
Random Forests address these issues by using an ensemble of multiple Decision Trees. This approach helps to
- reduce variance
- improve generalization
In contrast, Decision Trees alone tend to overfit the training data and typically exhibit
- higher variance
- lower bias
plot_tree(dtree) #This is best dtree with best_params
[Text(0.41304347826086957, 0.9, 'x[4] <= -0.458\nentropy = 0.999\nsamples = 1568\nvalue = [812.0, 756.0]'), Text(0.17391304347826086, 0.7, 'x[2] <= -0.414\nentropy = 0.474\nsamples = 798\nvalue = [717, 81]'), Text(0.08695652173913043, 0.5, 'x[7] <= 0.421\nentropy = 0.032\nsamples = 608\nvalue = [606, 2]'), Text(0.043478260869565216, 0.3, 'entropy = 0.0\nsamples = 600\nvalue = [600, 0]'), Text(0.13043478260869565, 0.3, 'entropy = 0.811\nsamples = 8\nvalue = [6, 2]'), Text(0.2608695652173913, 0.5, 'x[2] <= 0.06\nentropy = 0.979\nsamples = 190\nvalue = [111.0, 79.0]'), Text(0.21739130434782608, 0.3, 'x[4] <= -0.625\nentropy = 0.803\nsamples = 147\nvalue = [111.0, 36.0]'), Text(0.17391304347826086, 0.1, 'entropy = 0.116\nsamples = 64\nvalue = [63, 1]'), Text(0.2608695652173913, 0.1, 'entropy = 0.982\nsamples = 83\nvalue = [48, 35]'), Text(0.30434782608695654, 0.3, 'entropy = 0.0\nsamples = 43\nvalue = [0, 43]'), Text(0.6521739130434783, 0.7, 'x[2] <= -0.362\nentropy = 0.539\nsamples = 770\nvalue = [95, 675]'), Text(0.4782608695652174, 0.5, 'x[4] <= -0.368\nentropy = 0.997\nsamples = 103\nvalue = [48, 55]'), Text(0.391304347826087, 0.3, 'x[2] <= -0.512\nentropy = 0.746\nsamples = 33\nvalue = [26, 7]'), Text(0.34782608695652173, 0.1, 'entropy = 0.439\nsamples = 22\nvalue = [20, 2]'), Text(0.43478260869565216, 0.1, 'entropy = 0.994\nsamples = 11\nvalue = [6, 5]'), Text(0.5652173913043478, 0.3, 'x[6] <= 0.859\nentropy = 0.898\nsamples = 70\nvalue = [22, 48]'), Text(0.5217391304347826, 0.1, 'entropy = 0.958\nsamples = 58\nvalue = [22, 36]'), Text(0.6086956521739131, 0.1, 'entropy = 0.0\nsamples = 12\nvalue = [0, 12]'), Text(0.8260869565217391, 0.5, 'x[8] <= -1.236\nentropy = 0.368\nsamples = 667\nvalue = [47, 620]'), Text(0.7391304347826086, 0.3, 'x[6] <= 0.217\nentropy = 0.577\nsamples = 211\nvalue = [29, 182]'), Text(0.6956521739130435, 0.1, 'entropy = 0.908\nsamples = 65\nvalue = [21, 44]'), Text(0.782608695652174, 0.1, 'entropy = 0.306\nsamples = 146\nvalue = [8, 
138]'), Text(0.9130434782608695, 0.3, 'x[4] <= -0.166\nentropy = 0.24\nsamples = 456\nvalue = [18, 438]'), Text(0.8695652173913043, 0.1, 'entropy = 0.506\nsamples = 125\nvalue = [14, 111]'), Text(0.9565217391304348, 0.1, 'entropy = 0.094\nsamples = 331\nvalue = [4, 327]')]
Differential privacy¶
Adding noise to a dataset is a technique for data anonymization, which enhances privacy and security.
It makes identifying individuals in the dataset difficult and decreases risks of re-identification and inference attacks. However, it may reduce data quality and utility.
The impact of noise addition varies based on the specific tasks and applications for which the data is used.
If noise addition follows a differential privacy framework, it guarantees privacy protection. Balancing privacy preservation and data utility is crucial.
Common methods to add noise to data:
- Pulsed Noise
- Introduces abrupt changes resembling pulses, often used to simulate sudden disturbances
- Gaussian Noise
- Adds smooth, continuous variation following a normal distribution, commonly used in statistical modeling and simulations for a more natural variation.
sensitivity_range = [x / 10.0 for x in range(1, 30, 1)]
def noise_gen(data, sensitivity, eps=1.0):
    """Draw Laplace noise matching *data*'s shape with scale = sensitivity / eps."""
    laplace_scale = sensitivity / eps
    return np.random.laplace(loc=0, scale=laplace_scale, size=data.shape)
def add_noise():
    """Return noisy copies of x_train and x_test, one pair per sensitivity level."""
    # Draw train-noise then test-noise for each sensitivity, in that order,
    # to keep the global RNG consumption identical to a plain loop.
    noisy_pairs = [
        (x_train + noise_gen(x_train, sensitivity),
         x_test + noise_gen(x_test, sensitivity))
        for sensitivity in sensitivity_range
    ]
    nx_train = [pair[0] for pair in noisy_pairs]
    nx_test = [pair[1] for pair in noisy_pairs]
    return nx_train, nx_test
noisy_X_trains, noisy_X_tests = add_noise()
def plot_accuracy_vs_sensitivity(class_type, nX_trains, y_train, nX_tests, y_test):
    """Plot test accuracy of *class_type* against the Laplace-noise sensitivity.

    class_type: the classifier class to wrap in Classifier (e.g. DecisionTreeClassifier).
    nX_trains / nX_tests: noisy feature sets, one entry per value of sensitivity_range.
    y_train / y_test: clean targets shared across all noise levels.
    """
    scores = []
    for noisy_train, noisy_test in zip(nX_trains, nX_tests):
        model = Classifier(class_type, noisy_train, y_train, noisy_test, y_test)
        scores.append(model.accuracy_test())
    plt.plot(sensitivity_range, scores)
    plt.xlabel('sensitivity')
    plt.ylabel('Accuracy')
    # Derive the title from the class actually plotted — the old hard-coded
    # 'Decision Tree' title was wrong for the KNN and LogisticRegression calls.
    plt.title(f'{class_type.__name__} Accuracy with noise')
    plt.show()
# Accuracy-vs-sensitivity curves for each classifier family
for classifier_cls in (DecisionTreeClassifier,
                       KNeighborsClassifier,
                       LogisticRegression):
    plot_accuracy_vs_sensitivity(
        classifier_cls,
        noisy_X_trains, y_train,
        noisy_X_tests, y_test,
    )
# Train each classifier on one fixed noise level (index 3 -> sensitivity 0.4)
noise_index = 3
noisy_dtree_model = Classifier(DecisionTreeClassifier, noisy_X_trains[noise_index], y_train, noisy_X_tests[noise_index], y_test)
noisy_knear_model = Classifier(KNeighborsClassifier, noisy_X_trains[noise_index], y_train, noisy_X_tests[noise_index], y_test)
noisy_logreg_model = Classifier(LogisticRegression, noisy_X_trains[noise_index], y_train, noisy_X_tests[noise_index], y_test)
# Report test accuracy of every model trained on the noisy features
for model_label, noisy_model in (('Decision Tree', noisy_dtree_model),
                                 ('K Nearest Neighbors', noisy_knear_model),
                                 ('Logistic Regression', noisy_logreg_model)):
    print(f'Noisy {model_label}: {noisy_model.accuracy_test() * 100:.3f}%')
Noisy Decision Tree: 78.423% Noisy K Nearest Neighbors: 83.929% Noisy Logistic Regression: 86.607%
# Compare the confusion matrices of the three noise-trained models
for noisy_model in (noisy_dtree_model, noisy_knear_model, noisy_logreg_model):
    noisy_model.confusion_matrix()
Gradient-boosting¶
Gradient boosting is an ensemble learning technique used for both regression and classification problems.
It builds a series of weak learners, typically decision trees, sequentially.
Each new tree corrects the errors of the combined ensemble of the existing trees.
The predictions of each tree are weighted based on their performance, with more accurate trees receiving higher weights.
Working Process
- Initial Model:
Start with a simple model, often a shallow decision tree. - Residuals Calculation:
Calculate the residuals (differences between predictions and actual values) for the current model. - Next Model:
Build a new weak learner (tree) that focuses on minimizing the residuals of the previous model. - Weighted Combination:
Combine the new model with the previous ones, giving more weight to accurate models and less weight to less accurate ones. - Iteration:
Repeat steps 2-4 until a predefined number of models are created or until a specified level of performance is achieved.
Difference Between Boosting Trees and Decision Trees
As the Decision Trees are a simple and interpretable model that can be used for both regression and classification tasks. They work by recursively splitting the data into subsets based on the values of the input variables, until a stopping criterion is met. The resulting tree structure can be visualized and interpreted, making it easy to understand how the model is making predictions.
- Gradient Boosting is an ensemble method that combines multiple decision trees to make predictions, while Decision Trees are standalone models that make predictions based on a single tree.
- Both approaches can model non-linear relationships between the input variables and the output variable; Gradient Boosting, however, typically captures complex interactions more accurately by combining many trees, each correcting the residual errors of the ensemble so far.
- Gradient Boosting can also handle missing data and outliers more effectively than Decision Trees, as it is less prone to overfitting.
XGBoost and its functionality¶
Extreme Gradient Boosting (XGBoost) is a popular implementation of the Gradient Boosting algorithm. It is designed to be highly scalable and efficient.
Its gradient boosting technique minimizes a loss function, which measures how well the model can predict the target variable.
One of XGBoost's key features is its ability to handle missing data and outliers. It does this by using regularization, which penalizes complex models and encourages simpler models that are less likely to overfit the data.
Its pruning technique removes branches of the decision tree that do not contribute to the model's overall accuracy.
Another essential feature of XGBoost is its ability to handle sparse and dense data. It does this by using a technique called sparsity-aware split finding, which can more efficiently handle missing values and zero values in sparse data than traditional split finding algorithms.
Some features of XGBoost:
- Support for custom loss functions
- Early stopping to prevent overfitting
- The ability to handle multi-class classification problems
This algorithm is used in a wide range of machine learning tasks, including regression, classification, and ranking.
XGBoost algorithm¶
Important Hyperparameters for XGBoost
| Hyperparameter | Description |
|---|---|
| max_depth | The maximum depth of each decision tree in the ensemble. Increasing this value can make the model more complex and potentially more accurate but may also increase the risk of overfitting. |
| learning_rate | The step size is used to update the model weights during each iteration. A lower learning rate can make the model more conservative and less prone to overfitting but may also require more iterations to converge. |
| n_estimators | The number of decision trees in the ensemble. Increasing this value can make the model more accurate, but may also increase the risk of overfitting and make the model slower to train. |
| subsample | The fraction of the training data used to train each decision tree. Setting this value to less than 1.0 can make the model more robust to noise and reduce overfitting. |
| colsample_bytree | The fraction of the features used to train each decision tree. Setting this value to less than 1.0 can make the model more robust to noise and reduce overfitting. |
| gamma | The minimum reduction in the loss function required to make a split at a node. Increasing this value can make the model more conservative and less prone to overfitting. |
| reg_alpha | L1 regularization term on weights. Increasing this value can make the model more conservative and less prone to overfitting. |
| reg_lambda | L2 regularization term on weights. Increasing this value can make the model more conservative and less prone to overfitting. |
xgb = XGBClassifier()
# Hyperparameter grid for the XGBoost search: tree depth, shrinkage,
# ensemble size, and the minimum split-loss reduction (gamma)
params = {
    'max_depth': range(3, 11, 2),
    'learning_rate': [0.01, 0.03, 0.09, 0.1],
    'n_estimators': range(20, 201, 30),
    'gamma': [0, 1, 10, 100],
}
# 5-fold cross-validated grid search over the XGBoost hyperparameters
xgb_grid_search = GridSearchCV(estimator=xgb, param_grid=params, cv=5, scoring="accuracy")
xgb_grid_search.fit(x_train, y_train)
best_cv_accuracy = xgb_grid_search.best_score_ * 100
print(f"Best Accuracy: {best_cv_accuracy:2.2f}%")
print(f"Best Parameters: {xgb_grid_search.best_params_}")
held_out_accuracy = xgb_grid_search.score(x_test, y_test) * 100
print(f"Gradient-boosting Accuracy: {held_out_accuracy:2.2f}%")
Best Accuracy: 93.18%
Best Parameters: {'gamma': 1, 'learning_rate': 0.03, 'max_depth': 5, 'n_estimators': 200}
Gradient-boosting Accuracy: 92.56%
# Confusion matrix of the best XGBoost model on the test split
y_pred = xgb_grid_search.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
class_labels = ['0', '1']
cm_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)
ax = sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='g')
ax.set_title('XGBoost Confusion Matrix')
plt.show()